diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -110,12 +110,6 @@
   "gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;
 
 // Types for all sparse handles.
-def GPU_SparseEnvHandle :
-  DialectType<GPU_Dialect,
-    CPred<"llvm::isa<::mlir::gpu::SparseEnvHandleType>($_self)">,
-    "sparse environment handle type">,
-  BuildableType<"mlir::gpu::SparseEnvHandleType::get($_builder.getContext())">;
-
 def GPU_SparseDnTensorHandle :
   DialectType<GPU_Dialect,
     CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -165,7 +165,7 @@
 void addAsyncDependency(Operation *op, Value token);
 
 // Handle types for sparse.
-enum class SparseHandleKind { Env, SpMat, DnTensor };
+enum class SparseHandleKind { SpMat, DnTensor };
 
 template <SparseHandleKind K>
 class SparseHandleType
@@ -176,7 +176,6 @@
   using Base::Base;
 };
 
-using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
 using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1540,63 +1540,6 @@
 // Operation on sparse matrices, called from the host
 // (currently lowers to cuSparse for CUDA only, no ROCM lowering).
 //
-
-def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> {
-  let summary = "Create sparse environment operation";
-  let description = [{
-    The `gpu.create_sparse_env` operation initializes a sparse environment.
-    It must be executed prior to any other sparse operation. The operation
-    returns a handle to the new sparse environment.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %env, %token = gpu.create_sparse_env async [%dep]
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
-  let results = (outs Res<GPU_SparseEnvHandle>:$env,
-                      Optional<GPU_AsyncToken>:$asyncToken);
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
-  }];
-}
-
-def GPU_DestroySparseEnvOp : GPU_Op<
-    "destroy_sparse_env",
-    [GPU_AsyncOpInterface]> {
-  let summary = "Destroy sparse environment operation";
-  let description = [{
-    The `gpu.destroy_sparse_env` operation releases all resources of a sparse
-    environment represented by a handle that was previously created by a
-    `gpu.create_sparse_env` operation.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
- - Example: - - ```mlir - %token = gpu.destroy_sparse_env async [%dep] %env - ``` - }]; - - let arguments = (ins Variadic:$asyncDependencies, - Arg:$env); - let results = (outs Optional:$asyncToken); - - let assemblyFormat = [{ - custom(type($asyncToken), $asyncDependencies) - $env attr-dict - }]; -} - def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "Create dense tensor operation"; let description = [{ @@ -1612,19 +1555,18 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref + %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, AnyMemRef:$memref, Variadic:$dims); let results = (outs Res:$dnTensor, Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) + $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) }]; } @@ -1788,12 +1730,11 @@ Example: ```mlir - %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref + %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, Index:$rows, Index:$cols, AnyMemRef:$memref); @@ -1802,7 +1743,7 @@ let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $rows `,` $cols `,` $memref attr-dict `:` type($memref) }]; } @@ -1877,11 +1818,10 @@ Example: ```mlir - %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY into f32 + %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1894,7 +1834,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, @@ -1902,12 +1841,12 @@ , [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, spmatA, dnX, dnY, computeType);}]> + modeA, spmatA, dnX, dnY, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType + $spmatA (`{` $modeA^ `}`)? 
`,` $dnX `,` $dnY attr-dict `into` $computeType }]; } @@ -1930,11 +1869,10 @@ Example: ```mlir - %token = gpu.spmv async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 + %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1946,20 +1884,19 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, "Type":$computeType, "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, spmatA, dnX, dnY, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } @@ -1982,12 +1919,11 @@ Example: ```mlir - %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2001,7 +1937,6 @@ "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2009,12 +1944,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, - env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> + modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } @@ -2037,12 +1972,11 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 + %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2055,7 +1989,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2063,13 +1996,13 @@ "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } @@ -2088,7 +2021,7 @@ Example: ```mlir - %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 + %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 ``` The matrix arguments can also be associated with one of the following @@ -2097,7 +2030,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2110,7 +2042,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2118,12 +2049,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> + modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $spmatC attr-dict `into` $computeType }]; } @@ -2142,7 +2073,7 @@ Example: ```mlir - %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 + %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 ``` The matrix arguments can also be associated with one of the following @@ -2151,7 +2082,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2164,7 +2094,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2172,13 +2101,13 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -204,14 +204,6 @@ "mgpuSetDefaultDevice", llvmVoidType, {llvmInt32Type /* uint32_t devIndex */}}; - FunctionCallBuilder createSparseEnvCallBuilder = { - "mgpuCreateSparseEnv", - llvmPointerType, - {llvmPointerType /* void *stream */}}; - FunctionCallBuilder destroySparseEnvCallBuilder = { - "mgpuDestroySparseEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createDnVecCallBuilder = { "mgpuCreateDnVec", llvmPointerType, @@ -255,51 +247,40 @@ FunctionCallBuilder spMVBufferSizeCallBuilder = { "mgpuSpMVBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder spMVCallBuilder = { "mgpuSpMV", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMBufferSizeCallBuilder = { "mgpuSpMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMCallBuilder = { "mgpuSpMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, 
llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMBufferSizeCallBuilder = { "mgpuSDDMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMCallBuilder = { "mgpuSDDMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder createSparseLtEnvCallBuilder = { - "mgpuCreateSparseLtEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder destroySparseLtEnvCallBuilder = { - "mgpuDestroySparseLtEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createLtDnMatCallBuilder = { "mgpuCreateCuSparseLtDnMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { "mgpuDestroyCuSparseLtSpMat", llvmVoidType, @@ -311,20 +292,19 @@ FunctionCallBuilder create2To4SpMatCallBuilder = { "mgpuCusparseLtCreate2To4SpMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createCuSparseLtSpMMBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, - llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, + {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder createCuSparseLtSpMMBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType /*void *stream*/}}; + llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -515,34 +495,6 @@ ConversionPatternRewriter &rewriter) const override; }; -class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertCreateSparseEnvOpToGpuRuntimeCallPattern( - LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern( - typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; - -class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertDestroySparseEnvOpToGpuRuntimeCallPattern( - LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern( - 
typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; - class ConvertCreateDnTensorOpToGpuRuntimeCallPattern : public ConvertOpToGpuRuntimeCallPattern { public: @@ -1393,55 +1345,6 @@ static_cast(TValue)); } -LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::CreateSparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt create call if the dnmat is used with spmat with - // 2:4 sparsity - Value handle; - if (isSpMMCusparseLtOp(op.getEnv())) { - // CUDA runner asserts the size is 11024 bytes. - auto handleSz = rewriter.create( - loc, getIndexType(), rewriter.getIndexAttr(11024)); - handle = rewriter.create(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); - handle = rewriter.create(loc, llvmPointerType, handle); - createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) - .getResult(); - } else { - handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); - } - rewriter.replaceOp(op, {handle, stream}); - return success(); -} - -LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::DestroySparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt destroy call if the dnmat is used with spmat with - // 2:4 sparsity - if (isSpMMCusparseLtOp(op.getEnv())) { - destroySparseLtEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } else { - destroySparseEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } - rewriter.replaceOp(op, {stream}); - return success(); -} - LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::CreateDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1471,7 +1374,6 @@ // the dnmat is used with spmat with 2:4 sparsity if (dims.size() == 2) { if (isSpMMCusparseLtOp(op.getDnTensor())) { - auto envHandle = adaptor.getEnv(); auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create(loc, llvmInt8PointerType, @@ -1480,7 +1382,7 @@ createLtDnMatCallBuilder .create(loc, rewriter, - {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream}) + {handle, dims[0], dims[1], pTensor, dtp, stream}) .getResult(); } else { handle = @@ -1648,7 +1550,6 @@ Type dType = llvm::cast(op.getMemref().getType()).getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto envHandle = adaptor.getEnv(); // CUDA runner asserts the size is 44104 bytes. 
auto handleSz = rewriter.create( @@ -1659,8 +1560,7 @@ create2To4SpMatCallBuilder .create(loc, rewriter, - {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, - dtp, stream}) + {handle, adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) .getResult(); rewriter.replaceOp(op, {handle, stream}); return success(); @@ -1697,12 +1597,11 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = - spMVBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, stream}) - .getResult(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1723,9 +1622,8 @@ if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); spMVCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, pBuf, - stream}); + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } @@ -1750,9 +1648,8 @@ llvmInt64Type, three); createCuSparseLtSpMMBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, stream}) + {bufferSize, modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream}) .getResult(); auto bufferSizePtr1 = rewriter.create( @@ -1774,12 +1671,12 @@ } else { auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); - bufferSize = createSpMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); + bufferSize = + createSpMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } return success(); @@ -1797,12 +1694,12 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = createSDDMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, stream}) - .getResult(); + auto bufferSize = + createSDDMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), + adaptor.getSpmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1832,17 +1729,17 @@ } createCuSparseLtSpMMBuilder.create( loc, rewriter, - {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), pBufs[0], pBufs[1], pBufs[2], stream}); + {adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, 
llvmPointerType, pBuf); - createSpMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), - adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, pBuf, stream}); + createSpMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBuf, stream}); } rewriter.replaceOp(op, {stream}); return success(); @@ -1872,10 +1769,10 @@ MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); - createSDDMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, pBuf, stream}); + createSDDMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), + adaptor.getDnmatB(), adaptor.getSpmatC(), + computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } @@ -1887,7 +1784,6 @@ addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); - addOpaquePointerConversion(converter); patterns.add(); addTypes(); - addTypes(); addTypes(); addTypes(); addOperations< @@ -162,8 +161,6 @@ static std::string getSparseHandleKeyword(SparseHandleKind kind) { switch (kind) { - case SparseHandleKind::Env: - return "sparse.env_handle"; case SparseHandleKind::DnTensor: return "sparse.dntensor_handle"; case SparseHandleKind::SpMat: @@ -216,8 +213,6 @@ shape, elementType, operand); } - if (keyword == getSparseHandleKeyword(SparseHandleKind::Env)) - return SparseEnvHandleType::get(context); if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor)) return SparseDnTensorHandleType::get(context); if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat)) @@ -231,8 +226,6 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) .Case([&](Type) { os << "async.token"; }) - .Case( - [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::Env); }) .Case([&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnTensor); }) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -36,7 +36,6 @@ //===----------------------------------------------------------------------===// // Helper methods. //===----------------------------------------------------------------------===// - /// Marks the given top module as a GPU container module. static void markAsGPUContainer(ModuleOp topModule) { topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(), @@ -494,26 +493,21 @@ // Create sparse environment and sparse matrix/dense vector handles. 
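For orientation, here is a sketch (illustrative only, not part of this patch) of the kind of IR the SpMV rewrite below now produces: no environment handle is created or threaded through the sparse ops. Value names, sizes, and element types are placeholders modeled on the updated tests further down.

```mlir
// Illustrative shape of the SpMV lowering after this change; the buffer is an
// opaque i8 allocation sized by gpu.spmv_buffer_size.
func.func @spmv_lowered(%sz : index,
                        %rowIdxs : memref<?xindex>, %colIdxs : memref<?xindex>,
                        %values : memref<?xf64>, %x : memref<?xf64>, %y : memref<?xf64>) {
  %token0 = gpu.wait async
  %spmat, %token1 = gpu.create_coo async [%token0] %sz, %sz, %sz, %rowIdxs, %colIdxs, %values
      : memref<?xindex>, memref<?xindex>, memref<?xf64>
  %dnX, %token2 = gpu.create_dn_tensor async [%token1] %x, %sz : index into memref<?xf64>
  %dnY, %token3 = gpu.create_dn_tensor async [%token2] %y, %sz : index into memref<?xf64>
  %bufSz, %token4 = gpu.spmv_buffer_size async [%token3] %spmat, %dnX, %dnY into f64
  %buf, %token5 = gpu.alloc async [%token4] (%bufSz) : memref<?xi8>
  %token6 = gpu.spmv async [%token5] %spmat, %dnX, %dnY, %buf : memref<?xi8> into f64
  %token7 = gpu.destroy_sp_mat async [%token6] %spmat
  %token8 = gpu.destroy_dn_tensor async [%token7] %dnX
  %token9 = gpu.destroy_dn_tensor async [%token8] %dnY
  gpu.wait [%token9]
  return
}
```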
Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spmatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); - token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA, rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dvecX = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX); + loc, dnTensorHandleTp, tokenTp, token, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); auto dvecY = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY); + loc, dnTensorHandleTp, tokenTp, token, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -521,7 +515,7 @@ // Precompute buffersize for SpMV. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY, + loc, indexTp, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -530,9 +524,8 @@ token = buf.getAsyncToken(); // Perform the SpMV. - auto spmvComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnX, - dnY, /*computeType=*/dnYType, buffer); + auto spmvComp = rewriter.create( + loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer); token = spmvComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -542,8 +535,6 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnY) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) - .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) token = genDeallocMemRef(rewriter, loc, colA, token); @@ -601,27 +592,22 @@ // Create sparse environment and sparse matrix/dense matrix handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); - token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA, rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dmatB = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matB, + loc, dnTensorHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); auto dmatC = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matC, + loc, dnTensorHandleTp, tokenTp, token, matC, SmallVector{szm, szn}); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); @@ -630,7 +616,7 @@ // Precompute buffersize for SpMM. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC, + loc, indexTp, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dmatCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -641,9 +627,8 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Perform the SpMM. 
- auto spmmComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnB, - dnC, /*computeType=*/dnCType, buffer); + auto spmmComp = rewriter.create( + loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer); token = spmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -653,9 +638,6 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) - .getAsyncToken(); - token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) token = genDeallocMemRef(rewriter, loc, colA, token); token = genDeallocMemRef(rewriter, loc, valA, token); @@ -715,24 +697,16 @@ // Create sparse environment and sparse matrix/dense matrix handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnMatHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); - token = env.getAsyncToken(); - auto dmatA = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matA, - SmallVector{szm, szk}); + loc, dnMatHandleTp, tokenTp, token, matA, SmallVector{szm, szk}); Value dnA = dmatA.getResult(0); token = dmatA.getAsyncToken(); auto dmatB = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matB, - SmallVector{szk, szn}); + loc, dnMatHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); @@ -745,7 +719,7 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Precompute buffersize for SDDMM. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType); + loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); auto buf = genAllocBuffer(rewriter, loc, bufferSz, token); @@ -753,8 +727,8 @@ token = buf.getAsyncToken(); // Perform the SDDMM. - auto sddmmComp = rewriter.create( - loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer); + auto sddmmComp = rewriter.create(loc, tokenTp, token, dnA, dnB, + spMatC, dnCType, buffer); token = sddmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -764,8 +738,6 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, spMatC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) - .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, buffer, token); token = genDeallocMemRef(rewriter, loc, matA, token); token = genDeallocMemRef(rewriter, loc, matB, token); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -79,6 +79,22 @@ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } }; +// Note that (1) Nvidia confirms the safety to share handle across multiple +// instances, and streams. (2) Clients are responsible to call the @mgpu +// environment initialization/destruction in a thread-safe manner, e.g., +// at the beginning of the program before multi-threads are created. 
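An illustrative sketch (not part of the patch) of what this contract means for a client module, mirroring the updated conversion tests later in this change: the module declares the zero-argument wrapper entry points and calls them once around its sparse work, from a single thread, before any concurrent work begins.

```mlir
// Illustrative only; mirrors the updated tests below. The environment lives in
// the runtime wrappers, so the client just brackets its sparse work with the
// create/destroy calls instead of using gpu.create_sparse_env.
module attributes {gpu.container_module} {
  llvm.func @mgpuCreateSparseEnv()
  llvm.func @mgpuDestroySparseEnv()
  func.func @use_sparse() {
    llvm.call @mgpuCreateSparseEnv() : () -> ()   // one-time, single-threaded setup
    // ... gpu.create_coo / gpu.spmv / other sparse ops go here ...
    llvm.call @mgpuDestroySparseEnv() : () -> ()  // teardown after all sparse work
    return
  }
}
```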
+#ifdef MLIR_ENABLE_CUDA_CUSPARSE +static cusparseHandle_t cusparse_env = nullptr; + +#ifdef MLIR_ENABLE_CUDA_CUSPARSELT +// cusparseLtHandle_t is not a pointer type, so we need an additional flag to +// indicate whether it is initialized. +static cusparseLtHandle_t cusparseLt_env; +static bool cusparseLt_initiated = false; + +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE + extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { ScopedContext scopedContext; CUmodule module = nullptr; @@ -270,17 +286,19 @@ (beta##p) = reinterpret_cast(&(beta##d)); \ } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * -mgpuCreateSparseEnv(CUstream /*stream*/) { - cusparseHandle_t handle = nullptr; - CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle)) - return reinterpret_cast(handle); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() { + ScopedContext scopedContext; + if (!cusparse_env) { + CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&cusparse_env)); + } + return; } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseEnv(void *h, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle)) +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() { + ScopedContext scopedContext; + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(cusparse_env)); + cusparse_env = nullptr; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * @@ -359,10 +377,9 @@ CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp, - CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize( + int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) { + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); @@ -370,32 +387,32 @@ cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; - CUSPARSE_REPORT_IF_ERROR( - cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY, - cTp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize( + cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp, + CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a, - void *x, void *y, - int32_t ctp, void *buf, +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x, + void *y, int32_t ctp, + void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); cusparseDnVecDescr_t vecY = reinterpret_cast(y); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(cusparse_env, modeA, alphap, matA, vecX, betap, vecY, cTp, CUSPARSE_SPMV_ALG_DEFAULT, buf)) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -405,15 +422,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -421,16 +439,16 @@ cusparseDnMatDescr_t matC = reinterpret_cast(c); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(cusparse_env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, buf)) } // TODO: add support to passing alpha and beta as arguments extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -440,15 +458,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize)) 
return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + assert(cusparse_env && "client did not call mgpuCreateSparseEnv()"); cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -456,8 +475,8 @@ cusparseSpMatDescr_t matC = reinterpret_cast(c); auto cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(cusparse_env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } @@ -487,30 +506,33 @@ static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { - // note that cuSparseLt still uses cusparseStatus_t - CUSPARSE_REPORT_IF_ERROR( - cusparseLtInit(reinterpret_cast(h))) +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() { + ScopedContext scopedContext; + if (!cusparseLt_initiated) { + // Note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env)); + cusparseLt_initiated = true; + } } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() { + ScopedContext scopedContext; + assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env)); + cusparseLt_initiated = false; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dtp, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); +mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, + int32_t dtp, CUstream /*stream*/) { + assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); // CusparseLt expects the descriptors to be zero-initialized. memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast(dh); auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) dnmat_handle->values = values; } @@ -530,29 +552,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, +mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols, void *values, int32_t dtp, CUstream /*stream*/) { + assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); auto spmat_handle = reinterpret_cast(sh); // CusparseLt expects the descriptors to be zero-initialized. 
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; - auto handle = reinterpret_cast(h); auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, - dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, - void *b, void *c, int32_t ctp, - CUstream /*stream*/) { +mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, + void *c, int32_t ctp, CUstream /*stream*/) { + assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix - auto handle = reinterpret_cast(h); auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); @@ -565,22 +587,25 @@ cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( - handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), - &(matC->mat), &(matC->mat), cTp)) + &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat), + &(matB->mat), &(matC->mat), &(matC->mat), cTp)) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( - handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) + &cusparseLt_env, &(matA->alg_sel), &(matA->matmul), + CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( - handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( - handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) - CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace( + &cusparseLt_env, &(matA->plan), &workspace_size_)) CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( - handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_)) + &cusparseLt_env, &(matA->plan), &compressed_size_, + &compressed_buffer_size_)) + // avoid zero-alloc *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_); *compressed_size = (compressed_size_ == 0 ? 
1 : compressed_size_); @@ -589,23 +614,23 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, +mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace, void *dA_compressed, void *dA_compressedBuffer, CUstream stream) { - auto handle = reinterpret_cast(h); + assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); ALPHABETA(CUDA_R_32F, alpha, beta) CUSPARSE_REPORT_IF_ERROR( - cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + cusparseLtSpMMACompress(&cusparseLt_env, &(matA->plan), (matA->values), dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. D = A*B+C using C==D for now CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + cusparseLtMatmul(&cusparseLt_env, &(matA->plan), alphap, dA_compressed, matB->values, betap, matC->values, /*dD*/ matC->values, d_workspace, nullptr, 0)) diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -1,6 +1,8 @@ // RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { + llvm.func @mgpuCreateSparseLtEnv() + llvm.func @mgpuDestroySparseLtEnv() // CHECK-LABEL: func @matmul // CHECK: llvm.call @mgpuStreamCreate @@ -20,15 +22,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + llvm.call @mgpuCreateSparseLtEnv() : () -> () + %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseLtEnv() : () -> () + gpu.wait [%token9] return } diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -1,6 +1,8 @@ // RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s 
module attributes {gpu.container_module} { + llvm.func @mgpuCreateSparseEnv() + llvm.func @mgpuDestroySparseEnv() // CHECK-LABEL: func @matvec // CHECK: llvm.call @mgpuStreamCreate @@ -20,15 +22,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + gpu.wait [%token9] return } @@ -50,15 +52,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + gpu.wait [%token9] return } @@ -80,15 +82,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 - %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, 
%token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+    %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    llvm.call @mgpuDestroySparseEnv() : () -> ()
+    gpu.wait [%token9]
     return
   }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -5,6 +5,9 @@
 // RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
 
 module attributes {gpu.container_module} {
+
+  llvm.func @mgpuCreateSparseEnv()
+  llvm.func @mgpuDestroySparseEnv()
 
   // CHECK-LABEL:func @no_args(%{{.*}}: index)
   func.func @no_args(%sz : index) {
@@ -326,38 +329,36 @@
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     // CHECK: gpu.alloc async
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    // CHECK: gpu.create_sparse_env async
-    %env, %token3 = gpu.create_sparse_env async [%token2]
+    llvm.call @mgpuCreateSparseEnv() : () -> ()
     // CHECK: gpu.create_coo async
-    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
     // CHECK: gpu.create_csr async
     %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
     // CHECK: gpu.create_dn_tensor async
-    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref
+    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref
     // CHECK: gpu.spmv_buffer_size async
-    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64
+    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64
     // CHECK: gpu.spmv async
-    %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64
+    %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref into f64
     // CHECK: gpu.create_dn_tensor async
-    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref
     // CHECK: gpu.spmm_buffer_size async
-    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64
+    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64
     // CHECK: gpu.spmm async
-    %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64
+    %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref into f64
     // CHECK: gpu.sddmm_buffer_size async
-    %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64
+    %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64
     // CHECK: gpu.sddmm async
-    %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64
+    %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref into f64
     // CHECK: gpu.destroy_dn_tensor async
     %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
     // CHECK: gpu.destroy_sp_mat async
     %token15 = gpu.destroy_sp_mat async [%token14] %spmat
     // CHECK: gpu.destroy_dn_tensor async
     %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
-    // CHECK: gpu.destroy_sparse_env async
-    %token17 = gpu.destroy_sparse_env async [%token16] %env
+    llvm.call @mgpuDestroySparseEnv() : () -> ()
     // CHECK: gpu.wait
-    gpu.wait [%token17]
+    gpu.wait [%token16]
     return
   }
 }
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -6,29 +6,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}} : index into memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @matvec(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref
-    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
-    %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64
+    %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref
+    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64
+    %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
 
@@ -36,29 +32,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @matmul(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
-    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64
+    %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+    %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
 
@@ -66,29 +58,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @sddmm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
-    %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64
+    %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+    %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -45,19 +45,16 @@
 // CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
-// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_41]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
-// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref
-// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_19]] : memref
 // CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref
 // CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref
 // CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -43,18 +43,16 @@
 // CHECK: %[[VAL_37:.*]] = gpu.memcpy async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_32]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_15]], %[[VAL_20]], %[[VAL_25]], %[[VAL_31]], %[[VAL_37]]]
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
-// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_38]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : index into memref
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : index into memref
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
-// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
-// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref
+// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_13]] : memref
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref
 // CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -53,18 +53,16 @@
 // CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_18]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_10]], %[[VAL_15]], %[[VAL_23]], %[[VAL_28]], %[[VAL_33]]]
 // CHECK: %[[VAL_34:.*]] = gpu.wait async
-// CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]]
-// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_35]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_34]]] %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_csr async {{\[}}%[[VAL_40]]] %[[VAL_3]], %[[VAL_3]], %[[VAL_5]], %[[VAL_21]], %[[VAL_26]], %[[VAL_31]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
 // CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.alloc async {{\[}}%[[VAL_44]]] (%[[VAL_43]]) : memref
-// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
+// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
 // CHECK: %[[VAL_48:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_37]]
 // CHECK: %[[VAL_49:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_48]]] %[[VAL_39]]
 // CHECK: %[[VAL_50:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_49]]] %[[VAL_41]]
-// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] %[[VAL_35]]
-// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_51]]] %[[VAL_45]] : memref
+// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_50]]] %[[VAL_45]] : memref
 // CHECK: %[[VAL_53:.*]] = gpu.dealloc async {{\[}}%[[VAL_52]]] %[[VAL_8]] : memref<8x8xf64>
 // CHECK: %[[VAL_54:.*]] = gpu.dealloc async {{\[}}%[[VAL_53]]] %[[VAL_13]] : memref<8x8xf64>
 // CHECK: %[[VAL_55:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_21]] : memref
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -11,6 +11,9 @@
 // RUN: | FileCheck %s
 
 module {
+  llvm.func @mgpuCreateSparseLtEnv()
+  llvm.func @mgpuDestroySparseLtEnv()
+
   func.func @sampled_matmul(%a : memref<16x32xf16>,
                             %b : memref<32x16xf16>,
                             %c : memref<16x16xf16>) {
@@ -28,19 +31,17 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %env, %token7 = gpu.create_sparse_env async [%token6]
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
-    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
-    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
-    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
     %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref
     %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref
     %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref
-    %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token18 = gpu.destroy_sparse_env async [%token17] %env
-    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
    %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
    %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
    %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
@@ -57,6 +58,7 @@
   // using NVidia 2:4 structured sparsity for A.
  //
   func.func @main() {
+    llvm.call @mgpuCreateSparseLtEnv() : () -> ()
     %f0 = arith.constant 0.0 : f16
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -225,7 +227,8 @@
       %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
       vector.print %pc0 : vector<16xf16>
     }
-
+
+    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -32,6 +32,9 @@
 }>
 
 module {
+  llvm.func @mgpuCreateSparseEnv()
+  llvm.func @mgpuDestroySparseEnv()
+
   // Computes C = A x B with A sparse COO.
   func.func @matmulCOO(%A: tensor<8x8xf32, #SortedCOO>,
                        %B: tensor<8x8xf32>,
@@ -85,6 +88,7 @@
   // Main driver.
   //
   func.func @main() {
+    llvm.call @mgpuCreateSparseEnv(): () -> ()
     %f0 = arith.constant 0.0 : f32
     %f1 = arith.constant 1.0 : f32
 
@@ -173,6 +177,8 @@
 
     bufferization.dealloc_tensor %Acoo : tensor<8x8xf32, #SortedCOO>
     bufferization.dealloc_tensor %Acsr : tensor<8x8xf32, #CSR>
+
+    llvm.call @mgpuDestroySparseEnv(): () -> ()
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -32,6 +32,9 @@
 }>
 
 module {
+  llvm.func @mgpuCreateSparseEnv()
+  llvm.func @mgpuDestroySparseEnv()
+
   // Compute matrix vector y = Ax on COO with default index coordinates.
   func.func @matvecCOO(%A: tensor, %x: tensor, %y_in: tensor) -> tensor {
     %y_out = linalg.matvec
@@ -49,6 +52,7 @@
   }
 
   func.func @main() {
+    llvm.call @mgpuCreateSparseEnv() : () -> ()
     %f0 = arith.constant 0.0 : f64
     %f1 = arith.constant 1.0 : f64
     %c0 = arith.constant 0 : index
@@ -122,6 +126,8 @@
     // Release the resources.
     bufferization.dealloc_tensor %Acoo : tensor
     bufferization.dealloc_tensor %Acsr : tensor
+
+    llvm.call @mgpuDestroySparseEnv() : () -> ()
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -46,6 +46,9 @@
 // runs the resulting code with the JIT compiler.
 //
 module {
+  llvm.func @mgpuCreateSparseEnv()
+  llvm.func @mgpuDestroySparseEnv()
+
   //
   // A kernel that computes a sampled dense matrix matrix multiplication
   // using a "spy" function and in-place update of the sampling sparse matrix.
@@ -81,6 +84,7 @@
   // Main driver.
   //
   func.func @entry() {
+    llvm.call @mgpuCreateSparseEnv() : () -> ()
     %d0 = arith.constant 0.0 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -149,6 +153,7 @@
     bufferization.dealloc_tensor %0 : tensor
     bufferization.dealloc_tensor %1 : tensor
 
+    llvm.call @mgpuDestroySparseEnv() : () -> ()
     return
   }
 }