diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1963,7 +1963,7 @@
   }];
 }
 
-def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
+def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface, AttrSizedResultSegments]> {
   let summary = "Precompute buffersize for SpMM operation";
   let description = [{
     The `gpu.spmm_buffer_size` operation returns the buffer size required
@@ -1994,8 +1994,7 @@
                        GPU_SparseDnTensorHandle:$dnmatB,
                        GPU_SparseDnTensorHandle:$dnmatC,
                        TypeAttr:$computeType);
-  let results = (outs Res<AnyTypeOf<[Index, TupleOf<[Index, Index, Index]>]>>:$bufferSzs,
+  let results = (outs Variadic<Index>:$bufferSzs,
                       Optional<GPU_AsyncToken>:$asyncToken);
   let builders = [OpBuilder<(ins
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1746,18 +1746,31 @@
         rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
     auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
                                                    rewriter.getIndexAttr(3));
-    bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
-                                                 llvmInt64Type, three);
-    bufferSize =
-        rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, bufferSize);
-
+    auto bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
+                                                      llvmInt64Type, three);
     createCuSparseLtSpMMBufferSizeBuilder
         .create(loc, rewriter,
                 {bufferSize, adaptor.getEnv(), modeA, modeB,
                  adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(),
                  computeType, stream})
         .getResult();
-    rewriter.replaceOp(op, {bufferSize, stream});
+
+    auto bufferSizePtr1 = rewriter.create<LLVM::GEPOp>(
+        loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
+        ValueRange{rewriter.create<LLVM::ConstantOp>(
+            loc, getIndexType(), rewriter.getIndexAttr(1))});
+    auto bufferSizePtr2 = rewriter.create<LLVM::GEPOp>(
+        loc, llvmInt64PointerType, llvmInt64PointerType, bufferSize,
+        ValueRange{rewriter.create<LLVM::ConstantOp>(
+            loc, getIndexType(), rewriter.getIndexAttr(2))});
+    auto bufferSize0 =
+        rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSize);
+    auto bufferSize1 =
+        rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr1);
+    auto bufferSize2 =
+        rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr2);
+
+    rewriter.replaceOp(op, {bufferSize0, bufferSize1, bufferSize2, stream});
   } else {
     auto computeType = genConstInt32From(
         rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -556,9 +556,10 @@
   auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
   auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
   auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
-  auto workspace_size = reinterpret_cast<size_t *>(bs);
-  auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]);
-  auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]);
+  auto workspace_size = reinterpret_cast<int64_t *>(bs);
+  auto compressed_size = &(reinterpret_cast<int64_t *>(bs)[1]);
+  auto compressed_buffer_size = &(reinterpret_cast<int64_t *>(bs)[2]);
+  size_t workspace_size_, compressed_size_, compressed_buffer_size_;
   auto cTp = static_cast<cusparseComputeType>(ctp);
 
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
@@ -577,15 +578,14 @@
       handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
 
   CUSPARSE_REPORT_IF_ERROR(
-      cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size))
+      cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_))
   CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
-      handle, &(matA->plan), compressed_size, compressed_buffer_size))
-
+      handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_))
   // avoid zero-alloc
-  *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size);
-  *compressed_size = (*compressed_size == 0 ? 1 : *compressed_size);
+  *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_);
+  *compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_);
   *compressed_buffer_size =
-      (*compressed_buffer_size == 0 ? 1 : *compressed_buffer_size);
+      (compressed_buffer_size_ == 0 ? 1 : compressed_buffer_size_);
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir
@@ -23,7 +23,7 @@
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref<?xf16>
     %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf16>
-    %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple<index, index, index> into f16
+    %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index, index, index into f16
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -28,18 +28,14 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    // Allocating larger memory than enough for workspace and storing compressed
-    // matrices as we haven't implemented the op to unpack tuple %bufferSzs to
-    // retrieve these three sizes.
-    // TODO: implement the op to unpack tuple %bufferSzs.
-    %mem1, %token7 = gpu.alloc async [%token6] (%c1048576) : memref<?xf16>
-    %mem2, %token8 = gpu.alloc async [%token7] (%c1048576) : memref<?xf16>
-    %mem3, %token9 = gpu.alloc async [%token8] (%c1048576) : memref<?xf16>
-    %env, %token10 = gpu.create_sparse_env async [%token9]
-    %spmat, %token11 = gpu.create_2to4_spmat async [%token10] %env, %c16, %c32, %d_a: memref<16x32xf16>
-    %dnmat, %token12 = gpu.create_dn_tensor async [%token11] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
-    %dnmat2, %token13 = gpu.create_dn_tensor async [%token12] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
-    %bufferSzs, %token14 = gpu.spmm_buffer_size async [%token13] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : tuple<index, index, index> into f16
+    %env, %token7 = gpu.create_sparse_env async [%token6]
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index, index into f16
+    %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref<?xf16>
+    %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref<?xf16>
+    %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref<?xf16>
     %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
    %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
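Not part of the patch itself: a minimal standalone C++ sketch of the contract that the lowering and the runtime wrapper now share, under the assumptions visible in the diff. The wrapper queries cuSPARSELt into `size_t` temporaries, clamps each size to at least 1 to avoid zero-sized allocations, and writes the results into an `int64_t[3]` buffer that the lowering reads back as the three `index` results. The helper names (`querySizes`, `fillBufferSizes`) are hypothetical stand-ins for illustration only.

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for the cuSPARSELt queries; the real wrapper calls
// cusparseLtMatmulGetWorkspace and cusparseLtSpMMACompressedSize.
static void querySizes(size_t &workspace, size_t &compressed,
                       size_t &compressedBuffer) {
  workspace = 0; // e.g. no extra workspace needed for this configuration
  compressed = 4096;
  compressedBuffer = 2048;
}

// Mirrors the wrapper's out-parameter layout: bs points at int64_t[3],
// matching the i64 alloca the lowering creates and later loads from.
static void fillBufferSizes(void *bs) {
  auto *sizes = reinterpret_cast<int64_t *>(bs);
  size_t workspace_size_, compressed_size_, compressed_buffer_size_;
  querySizes(workspace_size_, compressed_size_, compressed_buffer_size_);
  // Avoid zero-sized allocations, as in the patch.
  sizes[0] = workspace_size_ == 0 ? 1 : workspace_size_;
  sizes[1] = compressed_size_ == 0 ? 1 : compressed_size_;
  sizes[2] = compressed_buffer_size_ == 0 ? 1 : compressed_buffer_size_;
}

int main() {
  int64_t sizes[3];
  fillBufferSizes(sizes);
  std::printf("%lld %lld %lld\n", (long long)sizes[0], (long long)sizes[1],
              (long long)sizes[2]);
}
```

The `size_t` temporaries exist because the cuSPARSELt API writes `size_t` values, while the GPU dialect results are `index` values materialized as `int64_t` loads; writing through the `int64_t` pointers directly would not match the API's parameter types.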