diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -1376,8 +1376,8 @@ if (isSpMMCusparseLtOp(op.getDnTensor())) { auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(11032)); - handle = rewriter.create(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); + handle = rewriter.create( + loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16); handle = rewriter.create(loc, llvmPointerType, handle); createLtDnMatCallBuilder @@ -1554,8 +1554,8 @@ // CUDA runner asserts the size is 44104 bytes. auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(44104)); - Value handle = rewriter.create(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); + Value handle = rewriter.create( + loc, llvmInt8PointerType, llvmInt8Type, handleSz, /*alignment=*/16); handle = rewriter.create(loc, llvmPointerType, handle); create2To4SpMatCallBuilder @@ -1644,8 +1644,8 @@ rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType())); auto three = rewriter.create(loc, getIndexType(), rewriter.getIndexAttr(3)); - auto bufferSize = rewriter.create(loc, llvmInt64PointerType, - llvmInt64Type, three); + auto bufferSize = rewriter.create( + loc, llvmInt64PointerType, llvmInt64Type, three, /*alignment=*/16); createCuSparseLtSpMMBufferSizeBuilder .create(loc, rewriter, {bufferSize, modeA, modeB, adaptor.getSpmatA(), diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -512,7 +512,7 @@ ScopedContext scopedContext; assert(!cusparseLt_initiated && "client called mgpuCreateSparseLtEnv() twice"); - // Note that cuSparseLt still uses cusparseStatus_t + // Note that cuSparseLt still uses cusparseStatus_t. CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env)); cusparseLt_initiated = true; } @@ -527,29 +527,22 @@ mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, int32_t dtp, CUstream /*stream*/) { assert(cusparseLt_initiated && "client did not call mgpuCreateSparseLtEnv()"); - // CusparseLt expects the descriptors to be zero-initialized. - memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast(dh); + // CusparseLt expects the descriptors to be zero-initialized. + memset(dnmat_handle, 0, sizeof(cusparseLtDnMatHandleAndData)); + dnmat_handle->values = values; auto dTp = static_cast(dtp); - // assuming row-major when deciding lda + // Assume row-major when deciding lda. + const uint32_t alignment = 16; CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, - /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) - dnmat_handle->values = values; -} - -// This can be used to destroy both dense matrices and sparse matrices in -// cusparseLt -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast(m); - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) + alignment, dTp, CUSPARSE_ORDER_ROW)) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast(m); - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) +mgpuDestroyCuSparseLtDnMat(void *dh, CUstream /*stream*/) { + auto dnmat_handle = reinterpret_cast(dh); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(dnmat_handle->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void @@ -561,11 +554,17 @@ memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; auto dTp = static_cast(dtp); - // assuming row-major when deciding lda + // Assume row-major when deciding lda. + const uint32_t alignment = 16; CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, - /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW, - CUSPARSELT_SPARSITY_50_PERCENT)) + &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols, alignment, + dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyCuSparseLtSpMat(void *sh, CUstream /*stream*/) { + auto spmat_handle = reinterpret_cast(sh); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(spmat_handle->mat))) } // Several things are being done in this stage, algorithm selection, planning, @@ -607,7 +606,7 @@ &cusparseLt_env, &(matA->plan), &compressed_size_, &compressed_buffer_size_)) - // avoid zero-alloc + // Avoid zero-allocation. *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_); *compressed_size = (compressed_size_ == 0 ? 1 : compressed_size_); *compressed_buffer_size =