diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1540,63 +1540,6 @@
 // Operation on sparse matrices, called from the host
 // (currently lowers to cuSparse for CUDA only, no ROCM lowering).
 //
-
-def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> {
-  let summary = "Create sparse environment operation";
-  let description = [{
-    The `gpu.create_sparse_env` operation initializes a sparse environment.
-    It must be executed prior to any other sparse operation. The operation
-    returns a handle to the new sparse environment.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %env, %token = gpu.create_sparse_env async [%dep]
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
-  let results = (outs Res<GPU_SparseEnvHandle>:$env,
-                      Optional<GPU_AsyncToken>:$asyncToken);
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
-  }];
-}
-
-def GPU_DestroySparseEnvOp : GPU_Op<
-    "destroy_sparse_env",
-    [GPU_AsyncOpInterface]> {
-  let summary = "Destroy sparse environment operation";
-  let description = [{
-    The `gpu.destroy_sparse_env` operation releases all resources of a sparse
-    environment represented by a handle that was previously created by a
-    `gpu.create_sparse_env` operation.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %token = gpu.destroy_sparse_env async [%dep] %env
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                       Arg<GPU_SparseEnvHandle>:$env);
-  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env attr-dict
-  }];
-}
-
 def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
   let summary = "Create dense tensor operation";
   let description = [{
@@ -1612,19 +1555,18 @@
     Example:
 
     ```mlir
-    %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref<?x?xf64>
+    %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref<?x?xf64>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                       GPU_SparseEnvHandle:$env,
                        AnyMemRef:$memref,
                        Variadic<Index>:$dims);
   let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
+    $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
   }];
 }
 
@@ -1788,12 +1730,11 @@
     Example:
 
     ```mlir
-    %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref<?xf16>
+    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf16>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                       GPU_SparseEnvHandle:$env,
                        Index:$rows,
                        Index:$cols,
                        AnyMemRef:$memref);
@@ -1802,7 +1743,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
   }];
 }
 
@@ -1877,11 +1818,10 @@
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY into f32
+    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32
     ```
   }];
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                       GPU_SparseEnvHandle:$env,
                        GPU_TransposeModeAttr:$modeA,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnTensorHandle:$dnX,
@@ -1894,7 +1834,6 @@
                    "Type":$bufferSz,
                    "Type":$asyncToken,
                    "ValueRange":$asyncDependencies,
-                   "Value":$env,
                    "Value":$spmatA,
                    "Value":$dnX,
                    "Value":$dnY,
@@ -1902,12 +1841,12 @@
     , [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
-                 env, modeA, spmatA, dnX, dnY, computeType);}]>
+                 modeA, spmatA, dnX, dnY, computeType);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType
+    $spmatA (`{` $modeA^ `}`)?
`,` $dnX `,` $dnY attr-dict `into` $computeType }]; } @@ -1930,11 +1869,10 @@ Example: ```mlir - %token = gpu.spmv async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 + %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1946,20 +1884,19 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, "Type":$computeType, "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, spmatA, dnX, dnY, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } @@ -1982,12 +1919,11 @@ Example: ```mlir - %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2001,7 +1937,6 @@ "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2009,12 +1944,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, - env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> + modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } @@ -2037,12 +1972,11 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 + %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2055,7 +1989,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2063,13 +1996,13 @@ "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } @@ -2088,7 +2021,7 @@ Example: ```mlir - %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 + %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 ``` The matrix arguments can also be associated with one of the following @@ -2097,7 +2030,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2110,7 +2042,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2118,12 +2049,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> + modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $spmatC attr-dict `into` $computeType }]; } @@ -2142,7 +2073,7 @@ Example: ```mlir - %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 + %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 ``` The matrix arguments can also be associated with one of the following @@ -2151,7 +2082,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2164,7 +2094,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2172,13 +2101,13 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -204,14 +204,6 @@ "mgpuSetDefaultDevice", llvmVoidType, {llvmInt32Type /* uint32_t devIndex */}}; - FunctionCallBuilder createSparseEnvCallBuilder = { - "mgpuCreateSparseEnv", - llvmPointerType, - {llvmPointerType /* void *stream */}}; - FunctionCallBuilder destroySparseEnvCallBuilder = { - "mgpuDestroySparseEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createDnVecCallBuilder = { "mgpuCreateDnVec", llvmPointerType, @@ -255,51 +247,40 @@ FunctionCallBuilder spMVBufferSizeCallBuilder = { "mgpuSpMVBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder spMVCallBuilder = { "mgpuSpMV", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMBufferSizeCallBuilder = { "mgpuSpMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMCallBuilder = { "mgpuSpMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, 
llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMBufferSizeCallBuilder = { "mgpuSDDMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMCallBuilder = { "mgpuSDDMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder createSparseLtEnvCallBuilder = { - "mgpuCreateSparseLtEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder destroySparseLtEnvCallBuilder = { - "mgpuDestroySparseLtEnv", - llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createLtDnMatCallBuilder = { "mgpuCreateCuSparseLtDnMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { "mgpuDestroyCuSparseLtSpMat", llvmVoidType, @@ -311,20 +292,19 @@ FunctionCallBuilder create2To4SpMatCallBuilder = { "mgpuCusparseLtCreate2To4SpMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createCuSparseLtSpMMBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, - llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, + {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder createCuSparseLtSpMMBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType /*void *stream*/}}; + llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -515,34 +495,6 @@ ConversionPatternRewriter &rewriter) const override; }; -class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertCreateSparseEnvOpToGpuRuntimeCallPattern( - LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern( - typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; - -class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertDestroySparseEnvOpToGpuRuntimeCallPattern( - LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern( - 
typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; - class ConvertCreateDnTensorOpToGpuRuntimeCallPattern : public ConvertOpToGpuRuntimeCallPattern { public: @@ -1393,55 +1345,6 @@ static_cast(TValue)); } -LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::CreateSparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt create call if the dnmat is used with spmat with - // 2:4 sparsity - Value handle; - if (isSpMMCusparseLtOp(op.getEnv())) { - // CUDA runner asserts the size is 11024 bytes. - auto handleSz = rewriter.create( - loc, getIndexType(), rewriter.getIndexAttr(11024)); - handle = rewriter.create(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); - handle = rewriter.create(loc, llvmPointerType, handle); - createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) - .getResult(); - } else { - handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); - } - rewriter.replaceOp(op, {handle, stream}); - return success(); -} - -LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::DestroySparseEnvOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt destroy call if the dnmat is used with spmat with - // 2:4 sparsity - if (isSpMMCusparseLtOp(op.getEnv())) { - destroySparseLtEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } else { - destroySparseEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } - rewriter.replaceOp(op, {stream}); - return success(); -} - LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::CreateDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1471,7 +1374,6 @@ // the dnmat is used with spmat with 2:4 sparsity if (dims.size() == 2) { if (isSpMMCusparseLtOp(op.getDnTensor())) { - auto envHandle = adaptor.getEnv(); auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create(loc, llvmInt8PointerType, @@ -1480,7 +1382,7 @@ createLtDnMatCallBuilder .create(loc, rewriter, - {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream}) + {handle, dims[0], dims[1], pTensor, dtp, stream}) .getResult(); } else { handle = @@ -1648,7 +1550,6 @@ Type dType = llvm::cast(op.getMemref().getType()).getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto envHandle = adaptor.getEnv(); // CUDA runner asserts the size is 44104 bytes. 
auto handleSz = rewriter.create( @@ -1659,8 +1560,7 @@ create2To4SpMatCallBuilder .create(loc, rewriter, - {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, - dtp, stream}) + {handle, adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) .getResult(); rewriter.replaceOp(op, {handle, stream}); return success(); @@ -1697,12 +1597,11 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = - spMVBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, stream}) - .getResult(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1723,9 +1622,8 @@ if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); spMVCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, pBuf, - stream}); + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } @@ -1750,9 +1648,8 @@ llvmInt64Type, three); createCuSparseLtSpMMBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, stream}) + {bufferSize, modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream}) .getResult(); auto bufferSizePtr1 = rewriter.create( @@ -1774,12 +1671,12 @@ } else { auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); - bufferSize = createSpMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); + bufferSize = + createSpMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } return success(); @@ -1797,12 +1694,12 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = createSDDMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, stream}) - .getResult(); + auto bufferSize = + createSDDMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), + adaptor.getSpmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1832,17 +1729,17 @@ } createCuSparseLtSpMMBuilder.create( loc, rewriter, - {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), pBufs[0], pBufs[1], pBufs[2], stream}); + {adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, 
llvmPointerType, pBuf);
-    createSpMMCallBuilder.create(
-        loc, rewriter,
-        {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(),
-         adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, pBuf, stream});
+    createSpMMCallBuilder.create(loc, rewriter,
+                                 {modeA, modeB, adaptor.getSpmatA(),
+                                  adaptor.getDnmatB(), adaptor.getDnmatC(),
+                                  computeType, pBuf, stream});
   }
   rewriter.replaceOp(op, {stream});
   return success();
 }
@@ -1872,10 +1769,10 @@
       MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
-  createSDDMMCallBuilder.create(
-      loc, rewriter,
-      {adaptor.getEnv(), modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(),
-       adaptor.getSpmatC(), computeType, pBuf, stream});
+  createSDDMMCallBuilder.create(loc, rewriter,
+                                {modeA, modeB, adaptor.getDnmatA(),
+                                 adaptor.getDnmatB(), adaptor.getSpmatC(),
+                                 computeType, pBuf, stream});
   rewriter.replaceOp(op, {stream});
   return success();
 }
@@ -1899,8 +1796,6 @@
       ConvertWaitAsyncOpToGpuRuntimeCallPattern,
       ConvertWaitOpToGpuRuntimeCallPattern,
       ConvertAsyncYieldToGpuRuntimeCallPattern,
-      ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
-      ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
       ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
       ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
       ConvertCreateCooOpToGpuRuntimeCallPattern,
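With the call-builder argument lists trimmed as above, the C entry points that this lowering targets become handle-free. A minimal sketch of the resulting ABI, assuming the usual `CUstream` typedef from the CUDA driver headers (only a representative subset of the wrappers is shown):

```cpp
#include <cstdint>
#include <cuda.h>

// Environment management: the handle argument and the stream are gone.
extern "C" void mgpuCreateSparseEnv();
extern "C" void mgpuDestroySparseEnv();

// Compute wrappers keep the trailing stream but drop the leading `void *h`
// environment pointer, e.g. the SpMV buffer-size query:
extern "C" intptr_t mgpuSpMVBufferSize(int32_t ma, void *a, void *x, void *y,
                                       int32_t ctp, CUstream /*stream*/);
```

These declarations mirror the `FunctionCallBuilder` type lists above: one `llvmInt32Type` per mode/compute-type enum, one `llvmPointerType` per descriptor, and a final pointer for the stream.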
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -37,6 +37,31 @@
 // Helper methods.
 //===----------------------------------------------------------------------===//
 
+// Helper method to create an llvm.call operation that invokes a specific
+// function.
+struct FunctionCallBuilder {
+  FunctionCallBuilder(StringRef functionName, Type returnType,
+                      ArrayRef<Type> argumentTypes)
+      : functionName(functionName),
+        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
+  LLVM::CallOp create(Location loc, OpBuilder &builder,
+                      ArrayRef<Value> arguments) const;
+
+  StringRef functionName;
+  LLVM::LLVMFunctionType functionType;
+};
+
+LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
+                                         ArrayRef<Value> arguments) const {
+  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
+  auto function = [&] {
+    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
+      return function;
+    return OpBuilder::atBlockEnd(module.getBody())
+        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
+  }();
+  return builder.create<LLVM::CallOp>(loc, function, arguments);
+}
+
 /// Marks the given top module as a GPU container module.
 static void markAsGPUContainer(ModuleOp topModule) {
   topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
@@ -458,6 +483,13 @@
 /// Match and rewrite SpMV kernel.
 static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) {
+  MLIRContext *context = rewriter.getContext();
+  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
+  FunctionCallBuilder createSparseEnvCallBuilder = {
+      "mgpuCreateSparseEnv", llvmVoidType, {}};
+  FunctionCallBuilder destroySparseEnvCallBuilder = {
+      "mgpuDestroySparseEnv", llvmVoidType, {}};
+
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value x = op.getOperand(1);
@@ -494,26 +526,22 @@
   // Create sparse environment and sparse matrix/dense vector handles.
   Type indexTp = rewriter.getIndexType();
-  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
   Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);
-  auto env =
-      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
-  Value handle = env.getResult(0);
-  token = env.getAsyncToken();
+  createSparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   Operation *spGenA =
       genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA,
               rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
   auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX);
+      loc, dnTensorHandleTp, tokenTp, token, vecX, szX);
   Value dnX = dvecX.getResult(0);
   token = dvecX.getAsyncToken();
   auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY);
+      loc, dnTensorHandleTp, tokenTp, token, vecY, szY);
   Value dnY = dvecY.getResult(0);
   token = dvecY.getAsyncToken();
 
@@ -521,7 +549,7 @@
   // Precompute buffersize for SpMV.
   auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY,
+      loc, indexTp, tokenTp, token, spMatA, dnX, dnY,
       /*computeType=*/dnYType);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
@@ -530,9 +558,8 @@
   token = buf.getAsyncToken();
   // Perform the SpMV.
-  auto spmvComp =
-      rewriter.create<gpu::SpMVOp>(loc, tokenTp, token, handle, spMatA, dnX,
-                                   dnY, /*computeType=*/dnYType, buffer);
+  auto spmvComp = rewriter.create<gpu::SpMVOp>(
+      loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer);
   token = spmvComp.getAsyncToken();
 
   // Copy data back to host and free all the resources.
@@ -542,8 +569,7 @@
               .getAsyncToken();
   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
              .getAsyncToken();
-  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
-              .getAsyncToken();
+  destroySparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
@@ -564,6 +590,13 @@
 /// Match and rewrite SpMM kernel.
 static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) {
+  MLIRContext *context = rewriter.getContext();
+  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
+  FunctionCallBuilder createSparseEnvCallBuilder = {
+      "mgpuCreateSparseEnv", llvmVoidType, {}};
+  FunctionCallBuilder destroySparseEnvCallBuilder = {
+      "mgpuDestroySparseEnv", llvmVoidType, {}};
+
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
@@ -601,27 +634,23 @@
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
-  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
   Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);
-  auto env =
-      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
-  Value handle = env.getResult(0);
-  token = env.getAsyncToken();
+  createSparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   Operation *spGenA =
       genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA,
               rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
   auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnTensorHandleTp, tokenTp, token, handle, matB,
+      loc, dnTensorHandleTp, tokenTp, token, matB,
       SmallVector<Value>{szk, szn});
   Value dnB = dmatB.getResult(0);
   token = dmatB.getAsyncToken();
   auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnTensorHandleTp, tokenTp, token, handle, matC,
+      loc, dnTensorHandleTp, tokenTp, token, matC,
       SmallVector<Value>{szm, szn});
   Value dnC = dmatC.getResult(0);
   token = dmatC.getAsyncToken();
 
@@ -630,7 +659,7 @@
   // Precompute buffersize for SpMM.
   auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC,
+      loc, indexTp, tokenTp, token, spMatA, dnB, dnC,
       /*computeType=*/dmatCType);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
@@ -641,9 +670,8 @@
   auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
   // Perform the SpMM.
-  auto spmmComp =
-      rewriter.create<gpu::SpMMOp>(loc, tokenTp, token, handle, spMatA, dnB,
-                                   dnC, /*computeType=*/dnCType, buffer);
+  auto spmmComp = rewriter.create<gpu::SpMMOp>(
+      loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer);
   token = spmmComp.getAsyncToken();
 
   // Copy data back to host and free all the resources.
@@ -653,9 +681,7 @@
               .getAsyncToken();
   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
-              .getAsyncToken();
-  token = genDeallocMemRef(rewriter, loc, rowA, token);
+  destroySparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
@@ -675,6 +701,13 @@
 /// Match and rewrite SDDMM kernel.
 static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
                                   linalg::GenericOp op, bool enableRT) {
+  MLIRContext *context = rewriter.getContext();
+  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
+  FunctionCallBuilder createSparseEnvCallBuilder = {
+      "mgpuCreateSparseEnv", llvmVoidType, {}};
+  FunctionCallBuilder destroySparseEnvCallBuilder = {
+      "mgpuDestroySparseEnv", llvmVoidType, {}};
+
   Location loc = op.getLoc();
   Value a = op.getOperand(0);
   Value b = op.getOperand(1);
@@ -715,24 +748,18 @@
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
-  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
   Type dnMatHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);
-  auto env =
-      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
-  Value handle = env.getResult(0);
-  token = env.getAsyncToken();
+  createSparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   auto dmatA = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, matA,
-      SmallVector<Value>{szm, szk});
+      loc, dnMatHandleTp, tokenTp, token, matA, SmallVector<Value>{szm, szk});
   Value dnA = dmatA.getResult(0);
   token = dmatA.getAsyncToken();
   auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, matB,
-      SmallVector<Value>{szk, szn});
+      loc, dnMatHandleTp, tokenTp, token, matB, SmallVector<Value>{szk, szn});
   Value dnB = dmatB.getResult(0);
   token = dmatB.getAsyncToken();
 
@@ -745,7 +772,7 @@
   auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
   // Precompute buffersize for SDDMM.
   auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType);
+      loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -753,8 +780,8 @@
   token = buf.getAsyncToken();
   // Perform the SDDMM.
-  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(
-      loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer);
+  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, dnA, dnB,
+                                                 spMatC, dnCType, buffer);
   token = sddmmComp.getAsyncToken();
 
   // Copy data back to host and free all the resources.
@@ -764,8 +791,7 @@
               .getAsyncToken();
   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
              .getAsyncToken();
-  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
-              .getAsyncToken();
+  destroySparseEnvCallBuilder.create(loc, rewriter, {}).getResult();
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, matA, token);
   token = genDeallocMemRef(rewriter, loc, matB, token);
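All three rewrites above now share the same environment bracketing: a plain `llvm.call` on entry and on exit, instead of threading an `%env` value through every sparse op. Stripped to its core, the emitted prologue/epilogue looks as follows (a sketch reusing the `loc`, `rewriter`, and `llvmVoidType` of the enclosing rewrite function):

```cpp
// Declares @mgpuCreateSparseEnv/@mgpuDestroySparseEnv on first use, then
// emits `llvm.call @mgpuCreateSparseEnv() : () -> ()` and its destroy twin.
FunctionCallBuilder createSparseEnvCallBuilder = {
    "mgpuCreateSparseEnv", llvmVoidType, {}};
FunctionCallBuilder destroySparseEnvCallBuilder = {
    "mgpuDestroySparseEnv", llvmVoidType, {}};

createSparseEnvCallBuilder.create(loc, rewriter, {});  // prologue
// ... create handles, size buffers, run SpMV/SpMM/SDDMM ...
destroySparseEnvCallBuilder.create(loc, rewriter, {}); // epilogue
```

Note that these calls are not chained into the async token sequence, which is consistent with the handle becoming process-wide state in the runtime wrappers below rather than a per-stream resource.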
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -79,6 +79,34 @@
   ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
 };
 
+#ifdef MLIR_ENABLE_CUDA_CUSPARSE
+// Create the cuSPARSE handle once for the duration of the process.
+class ScopedCuSparseHandleStorage {
+public:
+  static cusparseHandle_t env;
+  static bool initiated;
+  ScopedCuSparseHandleStorage() {}
+  ~ScopedCuSparseHandleStorage() {}
+};
+
+cusparseHandle_t ScopedCuSparseHandleStorage::env = nullptr;
+bool ScopedCuSparseHandleStorage::initiated = false;
+
+#ifdef MLIR_ENABLE_CUDA_CUSPARSELT
+class ScopedCuSparseLtHandleStorage {
+public:
+  static cusparseLtHandle_t env;
+  static bool initiated;
+  ScopedCuSparseLtHandleStorage() {}
+  ~ScopedCuSparseLtHandleStorage() {}
+};
+
+cusparseLtHandle_t ScopedCuSparseLtHandleStorage::env;
+bool ScopedCuSparseLtHandleStorage::initiated = false;
+
+#endif // MLIR_ENABLE_CUDA_CUSPARSELT
+#endif // MLIR_ENABLE_CUDA_CUSPARSE
+
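The `initiated` check-then-set in these storage classes is not synchronized, so concurrent host threads entering `mgpuCreateSparseEnv` could race on first use. For comparison only (this is not what the patch does), the same create-once semantics could be written thread-safely with a function-local static and `std::call_once`:

```cpp
#include <mutex>

// Hypothetical alternative: process-wide handle with thread-safe, one-time
// initialization. CUSPARSE_REPORT_IF_ERROR is the macro defined earlier in
// this file.
static cusparseHandle_t getCuSparseEnv() {
  static cusparseHandle_t env = nullptr;
  static std::once_flag once;
  std::call_once(once, [] { CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&env)); });
  return env;
}
```

A matching destroy would need its own guard; the patch instead resets `initiated` in `mgpuDestroySparseEnv` so that a later create can re-initialize.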
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
@@ -270,17 +298,19 @@
     (beta##p) = reinterpret_cast<void *>(&(beta##d));                         \
   }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuCreateSparseEnv(CUstream /*stream*/) {
-  cusparseHandle_t handle = nullptr;
-  CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle))
-  return reinterpret_cast<void *>(handle);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv() {
+  ScopedCuSparseHandleStorage hstorage;
+  if (!hstorage.initiated) {
+    CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&(hstorage.env)));
+    hstorage.initiated = true;
+  }
+  return;
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuDestroySparseEnv(void *h, CUstream /*stream*/) {
-  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
-  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle))
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseEnv() {
+  ScopedCuSparseHandleStorage hstorage;
+  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(hstorage.env))
+  hstorage.initiated = false;
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
@@ -359,10 +389,10 @@
   CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
-mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp,
-                   CUstream /*stream*/) {
-  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
+    int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) {
+  ScopedCuSparseHandleStorage hstorage;
+
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
@@ -370,32 +400,33 @@
   cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
   ALPHABETA(cTp, alpha, beta)
   size_t bufferSize = 0;
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY,
-                              cTp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
+      hstorage.env, modeA, alphap, matA, vecX, betap, vecY, cTp,
+      CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a,
-                                                   void *x, void *y,
-                                                   int32_t ctp, void *buf,
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x,
+                                                   void *y, int32_t ctp,
+                                                   void *buf,
                                                    CUstream /*stream*/) {
-  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+
+  ScopedCuSparseHandleStorage hstorage;
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
   cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
   cudaDataType_t cTp = static_cast<cudaDataType_t>(ctp);
   ALPHABETA(cTp, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX,
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(hstorage.env, modeA, alphap, matA, vecX,
                                         betap, vecY, cTp,
                                         CUSPARSE_SPMV_ALG_DEFAULT, buf))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
-mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c,
                    int32_t ctp, CUstream /*stream*/) {
-  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  ScopedCuSparseHandleStorage hstorage;
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
@@ -405,15 +436,16 @@
   ALPHABETA(cTp, alpha, beta)
   size_t bufferSize = 0;
   CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
-      handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
       CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ?
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -421,16 +453,16 @@ cusparseDnMatDescr_t matC = reinterpret_cast(c); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, buf)) } // TODO: add support to passing alpha and beta as arguments extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -440,15 +472,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -456,8 +489,8 @@ cusparseSpMatDescr_t matC = reinterpret_cast(c); auto cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } @@ -487,30 +520,32 @@ static_assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { - // note that cuSparseLt still uses cusparseStatus_t - CUSPARSE_REPORT_IF_ERROR( - cusparseLtInit(reinterpret_cast(h))) +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv() { + ScopedCuSparseLtHandleStorage hstorage; + if (!hstorage.initiated) { + hstorage.initiated = true; + // note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&(hstorage.env))); + } } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroySparseLtEnv() { + ScopedCuSparseLtHandleStorage hstorage; + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&(hstorage.env))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dtp, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); +mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, + int32_t dtp, CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; + // CusparseLt expects the descriptors to be zero-initialized. memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast(dh); auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + &(hstorage.env), &(dnmat_handle->mat), rows, cols, /*lda=*/cols, /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) dnmat_handle->values = values; } @@ -530,29 +565,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, +mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols, void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast(sh); // CusparseLt expects the descriptors to be zero-initialized. 
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, - dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + &(hstorage.env), &(spmat_handle->mat), rows, cols, /*ld=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, - void *b, void *c, int32_t ctp, - CUstream /*stream*/) { +mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, + void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); @@ -565,22 +600,25 @@ cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( - handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), - &(matC->mat), &(matC->mat), cTp)) + &(hstorage.env), &(matA->matmul), modeA, modeB, &(matA->mat), + &(matB->mat), &(matC->mat), &(matC->mat), cTp)) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( - handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) + &(hstorage.env), &(matA->alg_sel), &(matA->matmul), + CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( - handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + &(hstorage.env), &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( - handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + &(hstorage.env), &(matA->plan), &(matA->matmul), &(matA->alg_sel))) - CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace( + &(hstorage.env), &(matA->plan), &workspace_size_)) CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( - handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_)) + &(hstorage.env), &(matA->plan), &compressed_size_, + &compressed_buffer_size_)) + // avoid zero-alloc *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_); *compressed_size = (compressed_size_ == 0 ? 
1 : compressed_size_); @@ -589,23 +627,23 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, +mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace, void *dA_compressed, void *dA_compressedBuffer, CUstream stream) { - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); ALPHABETA(CUDA_R_32F, alpha, beta) CUSPARSE_REPORT_IF_ERROR( - cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + cusparseLtSpMMACompress(&(hstorage.env), &(matA->plan), (matA->values), dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. D = A*B+C using C==D for now CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + cusparseLtMatmul(&(hstorage.env), &(matA->plan), alphap, dA_compressed, matB->values, betap, matC->values, /*dD*/ matC->values, d_workspace, nullptr, 0)) diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -1,6 +1,10 @@ // RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { + llvm.func @mgpuCreateSparseEnv() + llvm.func @mgpuCreateSparseLtEnv() + llvm.func @mgpuDestroySparseEnv() + llvm.func @mgpuDestroySparseLtEnv() // CHECK-LABEL: func @matmul // CHECK: llvm.call @mgpuStreamCreate @@ -20,15 +24,17 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + llvm.call @mgpuCreateSparseEnv() : () -> () + llvm.call @mgpuCreateSparseLtEnv() : () -> () + %spmat, %token4 = gpu.create_2to4_spmat async [%token2] %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + llvm.call @mgpuDestroySparseLtEnv() : () -> () + gpu.wait [%token9] return } diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ 
b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -1,6 +1,8 @@ // RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { + llvm.func @mgpuCreateSparseEnv() + llvm.func @mgpuDestroySparseEnv() // CHECK-LABEL: func @matvec // CHECK: llvm.call @mgpuStreamCreate @@ -20,15 +22,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + gpu.wait [%token9] return } @@ -50,15 +52,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + gpu.wait [%token9] return } @@ -80,15 +82,15 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, 
%spmat into f64 - %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + llvm.call @mgpuCreateSparseEnv() : () -> () + %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64 + %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env - gpu.wait [%token10] + llvm.call @mgpuDestroySparseEnv() : () -> () + gpu.wait [%token9] return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -5,6 +5,9 @@ // RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s module attributes {gpu.container_module} { + + llvm.func @mgpuCreateSparseEnv() + llvm.func @mgpuDestroySparseEnv() // CHECK-LABEL:func @no_args(%{{.*}}: index) func.func @no_args(%sz : index) { @@ -326,38 +329,36 @@ %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref // CHECK: gpu.alloc async %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - // CHECK: gpu.create_sparse_env async - %env, %token3 = gpu.create_sparse_env async [%token2] + llvm.call @mgpuCreateSparseEnv() : () -> () // CHECK: gpu.create_coo async - %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_dn_tensor async - %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref + %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref // CHECK: gpu.spmv_buffer_size async - %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 + %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async - %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 // CHECK: gpu.create_dn_tensor async - %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref + %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref // CHECK: gpu.spmm_buffer_size async - %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 + %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async - %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 // CHECK: gpu.sddmm_buffer_size async - %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64 + %bufferSz3, %token12 = 
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -6,29 +6,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}} : index into memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @matvec(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
-    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
-    %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
+    %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref<?xf64>
+    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64
+    %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
 
@@ -36,29 +32,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @matmul(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
-    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
+    %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+    %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
 
@@ -66,29 +58,25 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xindex>
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @sddmm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
-    %env, %token3 = gpu.create_sparse_env async [%token2]
-    %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
-    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
-    %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
+    %spmat, %token4 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
+    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+    %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
-    gpu.wait [%token10]
+    gpu.wait [%token9]
     return
   }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -45,19 +45,18 @@
 // CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
-// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
+// CHECK: @mgpuCreateSparseEnv()
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_41]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
-// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
-// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
+// CHECK: @mgpuDestroySparseEnv()
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_19]] : memref<?xindex>
 // CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
 // CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -43,18 +43,18 @@
 // CHECK: %[[VAL_37:.*]] = gpu.memcpy async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_32]] : memref<?xf64>, memref<?xf64>
 // CHECK: gpu.wait {{\[}}%[[VAL_15]], %[[VAL_20]], %[[VAL_25]], %[[VAL_31]], %[[VAL_37]]]
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
-// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: @mgpuCreateSparseEnv()
+// CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_38]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
-// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
-// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
+// CHECK: @mgpuDestroySparseEnv()
+// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_13]] : memref<?xindex>
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>
 // CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref<?xf64>
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref<?xi8>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -53,18 +53,18 @@
 // CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_18]] : memref<?xf64>, memref<?xf64>
 // CHECK: gpu.wait {{\[}}%[[VAL_10]], %[[VAL_15]], %[[VAL_23]], %[[VAL_28]], %[[VAL_33]]]
 // CHECK: %[[VAL_34:.*]] = gpu.wait async
-// CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]]
-// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_35]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: @mgpuCreateSparseEnv()
+// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_34]]] %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_csr async {{\[}}%[[VAL_40]]] %[[VAL_3]], %[[VAL_3]], %[[VAL_5]], %[[VAL_21]], %[[VAL_26]], %[[VAL_31]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
 // CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.alloc async {{\[}}%[[VAL_44]]] (%[[VAL_43]]) : memref<?xi8>
-// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref<?xi8> into f64
+// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref<?xi8> into f64
 // CHECK: %[[VAL_48:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_37]]
 // CHECK: %[[VAL_49:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_48]]] %[[VAL_39]]
 // CHECK: %[[VAL_50:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_49]]] %[[VAL_41]]
-// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] %[[VAL_35]]
-// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_51]]] %[[VAL_45]] : memref<?xi8>
+// CHECK: @mgpuDestroySparseEnv()
+// CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_50]]] %[[VAL_45]] : memref<?xi8>
 // CHECK: %[[VAL_53:.*]] = gpu.dealloc async {{\[}}%[[VAL_52]]] %[[VAL_8]] : memref<8x8xf64>
 // CHECK: %[[VAL_54:.*]] = gpu.dealloc async {{\[}}%[[VAL_53]]] %[[VAL_13]] : memref<8x8xf64>
 // CHECK: %[[VAL_55:.*]] = gpu.dealloc async {{\[}}%[[VAL_54]]] %[[VAL_21]] : memref<?xindex>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -11,6 +11,9 @@
 // RUN: | FileCheck %s
 
 module {
+  llvm.func @mgpuCreateSparseLtEnv()
+  llvm.func @mgpuDestroySparseLtEnv()
+
   func.func @sampled_matmul(%a : memref<16x32xf16>,
                             %b : memref<32x16xf16>,
                             %c : memref<16x16xf16>) {
@@ -28,19 +31,19 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %env, %token7 = gpu.create_sparse_env async [%token6]
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
-    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
-    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
-    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+    llvm.call @mgpuCreateSparseLtEnv() : () -> ()
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token6] %c16, %c32, %d_a: memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16: index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16: index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
     %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
     %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
     %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
-    %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>,memref<?xf16> into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token18 = gpu.destroy_sparse_env async [%token17] %env
-    %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
+    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
+    %token19 = gpu.memcpy async [%token17] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
     %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>
     %token22 = gpu.dealloc async [%token21] %d_a : memref<16x32xf16>
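For reference, a minimal sketch of the calling convention the updated tests follow; this example is illustrative and not part of the patch (the @example function and its body are made up), but the declared runtime entry points and the op syntax are the ones the diff uses. The environment is now set up and torn down with plain runtime calls, so no environment handle or extra async token is threaded through the sparse ops:

```mlir
module attributes {gpu.container_module} {
  // Runtime entry points, declared once per module (hypothetical placement).
  llvm.func @mgpuCreateSparseEnv()
  llvm.func @mgpuDestroySparseEnv()

  func.func @example(%arg0: index) {
    %token0 = gpu.wait async
    %mem, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xf64>
    // Set up the sparse environment with a direct call; unlike the removed
    // gpu.create_sparse_env, this yields no handle and no token.
    llvm.call @mgpuCreateSparseEnv() : () -> ()
    // Sparse ops chain async tokens directly, with no %env operand.
    %dnvec, %token2 = gpu.create_dn_tensor async [%token1] %mem, %arg0 : index into memref<?xf64>
    %token3 = gpu.destroy_dn_tensor async [%token2] %dnvec
    llvm.call @mgpuDestroySparseEnv() : () -> ()
    gpu.wait [%token3]
    return
  }
}
```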