diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1612,18 +1612,20 @@ Example: ```mlir - %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref + %dvec, %token = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, - AnyMemRef:$memref, Index:$size); - let results = (outs Res:$dvec, + GPU_SparseEnvHandle:$env, + AnyMemRef:$memref, + Index:$size); + let results = (outs Res:$dvec, Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $memref `,` $size attr-dict `:` type($memref) + $env `,` $memref `,` $size attr-dict `:` type($memref) }]; } @@ -1670,11 +1672,12 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_mat async [%dep] %mem, %size : memref + %dmat, %token = gpu.create_dn_mat async [%dep] %env, %rows, %cols, %mem : memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseEnvHandle:$env, Index:$rows, Index:$cols, AnyMemRef:$memref); @@ -1682,7 +1685,7 @@ let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) }]; } @@ -1829,6 +1832,41 @@ }]; } + +def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix with 2:4 sparsity operation"; + let description = [{ + The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense + format with 2:4 sparsity. + The buffers must already be copied from the host to the device prior to + using this operation. The operation returns a handle to the sparse + matrix descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. 
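Editorial note (not part of the patch or the op definition): 2:4 (50%) structured sparsity means that every contiguous group of four values along a row contains at most two nonzeros. A minimal host-side sketch of that constraint check, using made-up names (`satisfies2To4`), is:

```c++
// Editorial illustration only: check that a row-major buffer satisfies the
// 2:4 structured-sparsity constraint (at most two nonzeros in every
// contiguous group of four values).
#include <cstddef>
#include <cstdio>
#include <vector>

static bool satisfies2To4(const std::vector<float> &vals) {
  for (std::size_t i = 0; i + 4 <= vals.size(); i += 4) {
    int nonzeros = 0;
    for (std::size_t j = i; j < i + 4; ++j)
      nonzeros += (vals[j] != 0.0f) ? 1 : 0;
    if (nonzeros > 2)
      return false;
  }
  return true;
}

int main() {
  std::vector<float> row = {1.0f, 0.0f, 2.0f, 0.0f, 0.0f, 3.0f, 0.0f, 4.0f};
  std::printf("2:4 sparse: %s\n", satisfies2To4(row) ? "yes" : "no");
  return 0;
}
```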
+ + Example: + + ```mlir + %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseEnvHandle:$env, + Index:$rows, + Index:$cols, + AnyMemRef:$memref); + let results = (outs Res:$spMat, + Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + }]; +} + def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> { let summary = "Destroy sparse matrix operation"; let description = [{ @@ -2005,7 +2043,7 @@ Example: ```mlir - %buffersz, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; @@ -2017,11 +2055,12 @@ GPU_SparseDnMatHandle:$dnmatB, GPU_SparseDnMatHandle:$dnmatC, TypeAttr:$computeType); - let results = (outs Res:$bufferSz, + let results = (outs Res]>>:$bufferSzs, Optional:$asyncToken); let builders = [OpBuilder<(ins - "Type":$bufferSz, + "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, "Value":$env, @@ -2031,17 +2070,17 @@ "Type":$computeType), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, + return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `into` $computeType + $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } -def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> { +def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "SpMM operation"; let description = [{ The `gpu.spmm` operation performs the SpMM operation on the given sparse and @@ -2060,7 +2099,7 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffer into f32 + %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; @@ -2072,7 +2111,7 @@ GPU_SparseDnMatHandle:$dnmatB, GPU_SparseDnMatHandle:$dnmatC, TypeAttr:$computeType, - AnyMemRef:$buffer); + Variadic:$buffers); let results = (outs Optional:$asyncToken); let builders = [OpBuilder<(ins @@ -2083,16 +2122,16 @@ "Value":$dnmatB, "Value":$dnmatC, "Type":$computeType, - "Value":$buffer), [{ + "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, - modeB, spmatA, dnmatB, dnmatC, computeType, buffer);}]> + modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -99,6 +99,10 @@ Type llvmInt8Type = IntegerType::get(context, 8); Type llvmInt32Type = IntegerType::get(context, 32); Type llvmInt64Type = IntegerType::get(context, 64); + Type llvmInt8PointerType = + this->getTypeConverter()->getPointerType(llvmInt8Type); + Type llvmInt64PointerType = + this->getTypeConverter()->getPointerType(llvmInt64Type); Type llvmIntPtrType = IntegerType::get( context, this->getTypeConverter()->getPointerBitwidth(0)); @@ -275,6 +279,49 @@ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = { + "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = { + "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = { + "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder createSparseLtEnvCallBuilder = { + "mgpuCreateSparseLtEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySparseLtEnvCallBuilder = { + "mgpuDestroySparseLtEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createLtDnMatCallBuilder = { + "mgpuCreateCuSparseLtDnMat", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { + "mgpuDestroyCuSparseLtSpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyCuSparseLtDnMatBuilder = { + "mgpuDestroyCuSparseLtDnMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder create2To4SpMatCallBuilder = { + "mgpuCusparseLtCreate2To4SpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = { + "mgpuCuSparseLtSpMMBufferSize", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /*void *stream*/}}; + FunctionCallBuilder cuSparseLtSpmmBuilder = { + "mgpuCuSparseLtSpMM", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -577,6 +624,20 @@ ConversionPatternRewriter &rewriter) const override; }; +class ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::Create2To4SpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + class ConvertDestroySpMatOpToGpuRuntimeCallPattern : public 
ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
public:
@@ -715,6 +776,15 @@
  // TODO: add support to CUSPARSE_INDEX_16U: 1
 }

+static int32_t getCuSparseLtDataTypeFrom(Type type) {
+  if (type.isF16())
+    return 0; // CUSPARSE_COMPUTE_16F
+  if (type.isInteger(32))
+    return 1; // CUSPARSE_COMPUTE_32I
+  llvm_unreachable("unsupported type");
+  // TODO: add support to TF32
+}
+
 // Corresponding to cudaDataType_t defined in CUDA library_types.h.
 static int32_t getCuSparseDataTypeFrom(Type type) {
   if (llvm::isa(type)) {
@@ -753,6 +823,39 @@
   llvm_unreachable("unsupported element type");
 }

+// TODO: We may want a disablement/warning at MLIR compile time: cusparseLt
+// currently does not work for CUDA architectures below 8.0 and will trigger a
+// runtime error in the CUDA program. It would be good to at least emit a
+// warning when the target architecture is below 8.0 and the user still wants
+// to use cusparseLt, and to make sure the cusparseLt calls are disabled for
+// such architectures when lowering the GPU sparse dialect to LLVM calls.
+static bool is2To4Sparsity(Value spMat) {
+  if (auto op = spMat.getDefiningOp<gpu::Create2To4SpMatOp>())
+    return true;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
+    return false;
+  // Print the spMat defining op.
+  spMat.getDefiningOp()->print(llvm::errs());
+  llvm_unreachable("cannot find spmat def");
+}
+
+static bool isSpMMCusparseLtOp(Value op) {
+  for (Operation *user : op.getUsers()) {
+    auto spmmOp = dyn_cast<gpu::SpMMOp>(user);
+    // If the sparse operand of an SpMM user has 2:4 (50%) sparsity, we should
+    // use cusparseLt.
+    if (!spmmOp)
+      continue;
+    if (is2To4Sparsity(spmmOp.getSpmatA()))
+      return true;
+  }
+  return false;
+}
+
 // Returns whether all operands are of LLVM type.
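Editorial aside (not part of the patch): the dispatch above selects the cuSPARSELt path whenever the SpMM's sparse operand comes from `gpu.create_2to4_spmat`, so the input data is expected to already be in the 2:4 pattern. As a concrete illustration of that pattern, one simple pruning policy keeps the two largest-magnitude values in every group of four; the sketch below uses made-up names (`pruneTo2Of4`) and is not how the patch prepares data (cuSPARSELt also ships its own pruning helper, `cusparseLtSpMMAPrune`, which this patch does not call):

```c++
// Editorial illustration only: prune a row-major buffer to the 2:4 pattern by
// keeping the two largest-magnitude values in every contiguous group of four.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

static void pruneTo2Of4(std::vector<float> &vals) {
  for (std::size_t i = 0; i + 4 <= vals.size(); i += 4) {
    // Indices of the group, sorted by descending magnitude.
    std::size_t idx[4] = {i, i + 1, i + 2, i + 3};
    std::sort(idx, idx + 4, [&](std::size_t a, std::size_t b) {
      return std::fabs(vals[a]) > std::fabs(vals[b]);
    });
    vals[idx[2]] = 0.0f; // zero out the two smallest-magnitude entries
    vals[idx[3]] = 0.0f;
  }
}

int main() {
  std::vector<float> row = {0.5f, -2.0f, 1.0f, 0.25f};
  pruneTo2Of4(row);
  for (float v : row)
    std::printf("%g ", v); // prints: 0 -2 1 0
  std::printf("\n");
  return 0;
}
```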
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands, ConversionPatternRewriter &rewriter) { @@ -1314,8 +1417,23 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - auto handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + // Use the cusparseLt create call if the dnmat is used with spmat with + // 2:4 sparsity + Value handle; + if (isSpMMCusparseLtOp(op.getEnv())) { + // Assert the size is 11024 bytes + AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(11024)); + handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) + .getResult(); + } else { + handle = + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + } rewriter.replaceOp(op, {handle, stream}); return success(); } @@ -1328,7 +1446,15 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream}); + // Use the cusparseLt destroy call if the dnmat is used with spmat with + // 2:4 sparsity + if (isSpMMCusparseLtOp(op.getEnv())) { + destroySparseLtEnvCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), stream}); + } else { + destroySparseEnvCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1382,11 +1508,34 @@ pMat = rewriter.create(loc, llvmPointerType, pMat); Type dType = op.getMemref().getType().getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto handle = - createDnMatCallBuilder - .create(loc, rewriter, - {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) - .getResult(); + // TODO: For now, we track the use of the handle and lower it to cusparse / + // cusparseLt accordingly. If in a block, both cusparse and cusparseLt are + // used, we require two separate Creation ops to be the correct logic. In + // future, we may add support to using one handle in sparse tensor / GPU + // dialect in both cusparse and cusparseLt. 
use the cusparseLt create call if + // the dnmat is used with spmat with 2:4 sparsity + Value handle; + if (isSpMMCusparseLtOp(op.getDmat())) { + auto envHandle = adaptor.getEnv(); + AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(11032)); + handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + + createLtDnMatCallBuilder + .create(loc, rewriter, + {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, + dtp, stream}) + .getResult(); + } else { + handle = + createDnMatCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) + .getResult(); + } rewriter.replaceOp(op, {handle, stream}); return success(); } @@ -1399,7 +1548,14 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream}); + // Use the cusparseLt destroy call if the dnmat is used with spmat with + // 2:4 sparsity + if (isSpMMCusparseLtOp(op.getDmat())) { + destroyCuSparseLtDnMatBuilder.create(loc, rewriter, + {adaptor.getDmat(), stream}); + } else { + destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1454,8 +1610,7 @@ pIdxs = rewriter.create(loc, llvmPointerType, pIdxs); pValues = rewriter.create(loc, llvmPointerType, pValues); } - Type iType = - llvm::cast(op.getIdxs().getType()).getElementType(); + Type iType = llvm::cast(op.getIdxs().getType()).getElementType(); Type dType = llvm::cast(op.getValues().getType()).getElementType(); auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType)); @@ -1508,6 +1663,39 @@ return success(); } +LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::Create2To4SpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pMat = + MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pMat = rewriter.create(loc, llvmPointerType, pMat); + Type dType = + llvm::cast(op.getMemref().getType()).getElementType(); + auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType)); + auto envHandle = adaptor.getEnv(); + + AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(44104)); + Value handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + + create2To4SpMatCallBuilder + .create(loc, rewriter, + {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, + dtp, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::DestroySpMatOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1516,7 +1704,14 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + // 
Use the cusparseLt destroy call if the spmat is 2:4 sparsity + if (is2To4Sparsity(op.getSpmat())) { + destroyCuSparseLtSpMatBuilder.create(loc, rewriter, + {adaptor.getSpmat(), stream}); + + } else { + destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1577,14 +1772,29 @@ auto stream = adaptor.getAsyncDependencies().front(); auto computeType = genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); - - auto bufferSize = spMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); - rewriter.replaceOp(op, {bufferSize, stream}); + Value bufferSize; + if (is2To4Sparsity(op.getSpmatA())) { + auto three = rewriter.create(loc, getIndexType(), + rewriter.getIndexAttr(3)); + bufferSize = rewriter.create(loc, llvmInt64PointerType, + llvmInt64Type, three); + bufferSize = + rewriter.create(loc, llvmPointerType, bufferSize); + + cuSparseLtSpmmBufferSizeBuilder + .create(loc, rewriter, + {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + } else { + bufferSize = spMMBufferSizeCallBuilder + .create(loc, rewriter, + {adaptor.getEnv(), modeA, modeB, + adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + } return success(); } @@ -1623,14 +1833,31 @@ genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); auto stream = adaptor.getAsyncDependencies().front(); - Value pBuf = - MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); - if (!getTypeConverter()->useOpaquePointers()) - pBuf = rewriter.create(loc, llvmPointerType, pBuf); - spMMCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), - adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, - pBuf, stream}); + + // Lower to cusparseLt if applicable + if (is2To4Sparsity(op.getSpmatA())) { + SmallVector pBufs; + for (Value buffer : adaptor.getBuffers()) { + Value pBuf = MemRefDescriptor(buffer).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + pBufs.push_back(pBuf); + } + cuSparseLtSpmmBuilder.create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBufs[0], pBufs[1], pBufs[2], + stream}); + } else { + Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) + .allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + spMMCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBuf, stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1696,6 +1923,7 @@ ConvertCreateCooOpToGpuRuntimeCallPattern, ConvertCreateCooAoSOpToGpuRuntimeCallPattern, ConvertCreateCsrOpToGpuRuntimeCallPattern, + ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern, ConvertDestroySpMatOpToGpuRuntimeCallPattern, ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern, ConvertSpMVOpToGpuRuntimeCallPattern, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- 
a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -464,11 +464,11 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dvecX = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, vecX, szX); + token, handle, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); auto dvecY = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, vecY, szY); + token, handle, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -570,12 +570,12 @@ rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); - auto dmatB = rewriter.create(loc, dnMatHandleTp, tokenTp, - token, szk, szn, matB); + auto dmatB = rewriter.create( + loc, dnMatHandleTp, tokenTp, token, handle, szk, szn, matB); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); - auto dmatC = rewriter.create(loc, dnMatHandleTp, tokenTp, - token, szm, szn, matC); + auto dmatC = rewriter.create( + loc, dnMatHandleTp, tokenTp, token, handle, szm, szn, matC); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -191,8 +191,8 @@ # We need the libcuda.so library. find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) - # We need the libcusparse.so library. - find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + + add_mlir_library(mlir_cuda_runtime SHARED @@ -201,6 +201,7 @@ EXCLUDE_FROM_LIBMLIR ) set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14) + target_include_directories(mlir_cuda_runtime PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} @@ -208,8 +209,33 @@ target_link_libraries(mlir_cuda_runtime PRIVATE ${CUDA_RUNTIME_LIBRARY} - ${CUDA_CUSPARSE_LIBRARY} ) + + if(MLIR_ENABLE_CUDA_CUSPARSE) + + # Find the libcusparse.so library if CUSPARSE build is requested. + find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + target_link_libraries(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSE_LIBRARY} + ) + + if(MLIR_ENABLE_CUDA_CUSPARSELT) + # Find the libcusparseLt.so library in package manager default path if + # CUSPARSELT build is requested. libcusparseLt.so provides sm80+ tensor + # core support for 2:4 sparsity acceleration. 
+ find_library(CUDA_CUSPARSELT_LIBRARY cusparseLt HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + find_path(CUDA_CUSPARSELT_HEADER cusparseLt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} REQUIRED) + target_include_directories(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSELT_HEADER} + ) + target_link_libraries(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSELT_LIBRARY} + ) + endif() + endif() endif() if(MLIR_ENABLE_ROCM_RUNNER) diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -19,7 +19,13 @@ #include "cuda.h" #include "cuda_bf16.h" #include "cuda_fp16.h" + +#if MLIR_ENABLE_CUDA_CUSPARSE #include "cusparse.h" +#if MLIR_ENABLE_CUDA_CUSPARSELT +#include "cusparseLt.h" +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) @@ -226,6 +232,8 @@ defaultDevice = device; } +#if MLIR_ENABLE_CUDA_CUSPARSE + /// /// Wrapper methods for the cuSparse library. /// @@ -445,3 +453,162 @@ matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } + +#if MLIR_ENABLE_CUDA_CUSPARSELT + +/// +/// Wrapper methods for the cuSparseLt library. +/// + +struct cusparseLtSpMatHandleAndData { + cusparseLtMatDescriptor_t mat; + void *values{nullptr}; + // TODO: the following is associated with the SpMM operator rather than the + // sparse matrix. Create workspace buffers and pass them to the SpMM + // execution. + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulDescriptor_t matmul; +}; + +struct cusparseLtDnMatHandleAndData { + cusparseLtMatDescriptor_t mat; + void *values{nullptr}; +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() { + assert(sizeof(cusparseLtHandle_t) == 11024); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtSpMatHandleSize() { + return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() { + return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { + // note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR( + cusparseLtInit(reinterpret_cast(h))) + return; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { + auto handle = reinterpret_cast(h); + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, + void *values, int32_t dw, CUstream /*stream*/) { + cusparseLtMatDescriptor_t mat; + auto handle = reinterpret_cast(h); + auto dnmat_handle = reinterpret_cast(dh); + cudaDataType_t dtp = dataTp(dw); + // assuming row-major when deciding lda + CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( + handle, &(dh->mat), rows, cols, /*lda=*/cols, + /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW)) + dnmat_handle->values = values; +} + +// This can be used to destroy both dense matrices and sparse matrices in +// cusparseLt +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { + auto matAndData = reinterpret_cast(m); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { + 
auto matAndData = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, + void *values, int32_t dw, CUstream /*stream*/) { + auto spmat_handle = reinterpret_cast(sh); + spmat_handle->values = values; + auto handle = reinterpret_cast(h); + cudaDataType_t dtp = dataTp_cusparseLt(dw); + // assuming row-major when deciding lda + CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( + handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp, + CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) +} + +// Several things are being done in this stage, algorithm selection, planning, +// and returning workspace and compressed matrices data buffer sizes. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size, + void *compressed_buffer_size, void *h, void *a, + CUstream /*stream*/) { + // TODO: support more advanced settings, e.g., the input right operand is a + // sparse matrix assuming matA is the sparse matrix + auto handle = reinterpret_cast(h); + auto matA = reinterpret_cast(a); + + CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit( + handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) + int alg = 0; + CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( + handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + sizeof(alg))) + // TODO: add transpose support + CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( + handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat), + &matB, &matC, &matC, compute_type)) + CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul, + &(matWithData.alg_sel))) + + CHECK_CUSPARSE( + cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size)) + CHECK_CUSPARSE(cusparseLtSpMMACompressedSize( + handle, &(matA.plan), compressed_size, compressed_buffer_size)) + + // avoid zero-alloc + *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size); + *compressed_size = (*compressed_size == 0 ? 1 : *compressed_size); + *compressed_buffer_size = + (*compressed_buffer_size == 0 ? 1 : *compressed_buffer_size); + return; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a, + void *b, void *c, int32_t dw, void *buf, void *dA_compressed, + void *dA_compressedBuffer, CUstream stream) { + auto handle = reinterpret_cast(h); + auto matA = reinterpret_cast(a); + auto matB = reinterpret_cast(b); + auto matC = reinterpret_cast(c); + + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulDescriptor_t matmul; + + ALPHABETA(dw, alpha, beta) + + CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), + dA_compressed, dA_compressedBuffer, + stream)) + + // TODO: add support to multi-stream execution + // Perform the matrix multiplication. 
D = A*B+C using C==D for now + CHECK_CUSPARSE( + cusparseLtMatmul(handle, reinterpret_cast(plan), + &alpha, dA_compressed, dB, &beta, matC->values, + /*dD*/ matC->values, d_workspace, &stream, 1)) + + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + // destroy the plan associated with the sparse matrix + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan))) +} + +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -0,0 +1,35 @@ +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s + +module attributes {gpu.container_module} { + + // CHECK-LABEL: func @matmul + // CHECK: llvm.call @mgpuStreamCreate + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuCreateSparseLtEnv + // CHECK: llvm.call @mgpuCusparseLtCreate2To4SpMat + // CHECK: llvm.call @mgpuCreateCuSparseLtDnMat + // CHECK: llvm.call @mgpuCuSparseLtSpMMBufferSize + // CHECK: llvm.call @mgpuCuSparseLtSpMM + // CHECK: llvm.call @mgpuDestroyCuSparseLtSpMat + // CHECK: llvm.call @mgpuDestroyCuSparseLtDnMat + // CHECK: llvm.call @mgpuDestroySparseLtEnv + // CHECK: llvm.call @mgpuStreamSynchronize + // CHECK: llvm.call @mgpuStreamDestroy + func.func @matmul(%arg0: index) { + %token0 = gpu.wait async + %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref + %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref + %env, %token3 = gpu.create_sparse_env async [%token2] + %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple into f16 + %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + %token8 = gpu.destroy_sp_mat async [%token7] %spmat + %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token10 = gpu.destroy_sparse_env async [%token9] %env + gpu.wait [%token10] + return + } + +} diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -22,7 +22,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat @@ -52,8 +52,8 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, 
memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat into f64 + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_mat async [%token8] %dnmat @@ -82,7 +82,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -333,15 +333,15 @@ // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_dn_vec async - %dnvec, %token6 = gpu.create_dn_vec async [%token5] %mem2, %arg0 : memref + %dnvec, %token6 = gpu.create_dn_vec async [%token5] %env, %mem2, %arg0 : memref // CHECK: gpu.spmv_buffer_size async %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 // CHECK: gpu.create_dn_mat async - %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref + %dnmat, %token9 = gpu.create_dn_mat async [%token8] %env, %arg0, %arg0, %mem2 : memref // CHECK: gpu.spmm_buffer_size async - %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat into f64 + %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 // CHECK: gpu.sddmm_buffer_size async diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir --- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir +++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir @@ -8,7 +8,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat 
async [%{{.*}}] %{{.*}} @@ -22,7 +22,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat @@ -38,7 +38,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} @@ -52,8 +52,8 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat into f64 + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_mat async [%token8] %dnmat @@ -68,7 +68,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} @@ -82,7 +82,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, 
%dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir @@ -47,9 +47,9 @@ // CHECK: %[[VAL_41:.*]] = gpu.wait async // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]] // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref -// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref -// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref -// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] +// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref +// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref +// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]] diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir @@ -45,8 +45,8 @@ // CHECK: %[[VAL_38:.*]] = gpu.wait async // CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]] // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref -// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : memref -// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : memref +// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : memref +// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : memref // CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]] // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref // CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ 
b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7880,6 +7880,10 @@
    srcs = ["lib/ExecutionEngine/CudaRuntimeWrappers.cpp"],
    # Prevent needing EnableABIBreakingChecks symbol from LLVMSupport.
    copts = ["-DLLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING=1"],
+    # Here:
+    # MLIR_ENABLE_CUDA_CUSPARSE : enables cuSPARSE
+    # MLIR_ENABLE_CUDA_CUSPARSELT : enables cuSPARSELt
+    local_defines = ["MLIR_ENABLE_CUDA_CUSPARSE"],
    tags = [
        "manual",  # External dependency
        "nobuildkite",  # TODO(gcmn): Add support for this target
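Closing editorial note (not part of the patch): with the cuSPARSELt path, `gpu.spmm_buffer_size` now reports three sizes instead of one, following the (workspace, compressed, compressed-buffer) parameters of `mgpuCuSparseLtSpMMBufferSize` introduced above. The sketch below shows one way a host-side runner might consume those sizes; `SpMMLtBuffers` and `allocateLtBuffers` are made-up names, `cudaMalloc` is the real CUDA runtime call, and the patch itself allocates these buffers through `gpu.alloc` rather than this way:

```c++
// Editorial sketch: allocate the three device buffers whose sizes the 2:4
// spmm_buffer_size path returns. Error handling is elided; a real runner
// would check each returned cudaError_t.
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

struct SpMMLtBuffers {
  void *workspace = nullptr;     // cusparseLt matmul workspace
  void *compressedA = nullptr;   // compressed storage for the sparse A operand
  void *compressedTmp = nullptr; // scratch used while compressing A
};

static SpMMLtBuffers allocateLtBuffers(const int64_t sizes[3]) {
  SpMMLtBuffers b;
  cudaMalloc(&b.workspace, static_cast<std::size_t>(sizes[0]));
  cudaMalloc(&b.compressedA, static_cast<std::size_t>(sizes[1]));
  cudaMalloc(&b.compressedTmp, static_cast<std::size_t>(sizes[2]));
  return b;
}
```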