diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -128,6 +128,13 @@
       "dense matrix handle type">,
     BuildableType<"mlir::gpu::SparseDnMatHandleType::get($_builder.getContext())">;
 
+// TODO: remove the above two
+def GPU_SparseDnTensorHandle :
+    DialectType<GPU_Dialect,
+      CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
+      "dense tensor handle type">,
+    BuildableType<"mlir::gpu::SparseDnTensorHandleType::get($_builder.getContext())">;
+
 def GPU_SparseSpMatHandle :
     DialectType<GPU_Dialect,
       CPred<"llvm::isa<::mlir::gpu::SparseSpMatHandleType>($_self)">,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -164,8 +164,9 @@
 // Adds a `gpu.async.token` to the front of the argument list.
 void addAsyncDependency(Operation *op, Value token);
 
+// TODO: remove the DnVec and DnMat kinds once DnTensor replaces them
 // Handle types for sparse.
-enum class SparseHandleKind { Env, DnVec, DnMat, SpMat };
+enum class SparseHandleKind { Env, DnVec, DnMat, SpMat, DnTensor };
 
 template <SparseHandleKind K>
 class SparseHandleType
@@ -179,6 +180,8 @@
 using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
 using SparseDnVecHandleType = SparseHandleType<SparseHandleKind::DnVec>;
 using SparseDnMatHandleType = SparseHandleType<SparseHandleKind::DnMat>;
+// TODO: remove the above two
+using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
 
 } // namespace gpu
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1717,6 +1717,65 @@
   }];
 }
 
+def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
+  let summary = "Create dense tensor operation";
+  let description = [{
+    The `gpu.create_dn_tensor` operation initializes a dense tensor from
+    the given values buffer and sizes. The buffer must already be copied
+    from the host to the device prior to using this operation. The
+    operation returns a handle to the dense tensor descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In that
+    case, it returns a !gpu.async.token in addition to the dense tensor handle.
+
+    Example:
+
+    ```mlir
+    %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %dims, %mem : index, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseEnvHandle:$env,
+                       Variadic<Index>:$dims,
+                       AnyMemRef:$memref);
+  let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $dims `,` $memref attr-dict `:` type($dims) `,` type($memref)
+  }];
+}
+
+def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense tensor operation";
+  let description = [{
+    The `gpu.destroy_dn_tensor` operation releases all resources of a dense
+    tensor represented by a handle that was previously created by a
+    `gpu.create_dn_tensor` operation.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In that
+    case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseDnTensorHandle>:$dnTensor);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $dnTensor attr-dict
+  }];
+}
+
 def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix in COO format operation";
   let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -548,6 +548,35 @@
       ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreateDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp> {
+public:
+  ConvertCreateDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::CreateDnTensorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroyDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp> {
+public:
+  ConvertDestroyDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+// TODO: remove the DnVec and DnMat patterns below once DnTensor replaces them
 class ConvertCreateDnVecOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
 public:
@@ -1509,6 +1538,100 @@
   return success();
 }
 
+LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateDnTensorOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pTensor =
+      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers())
+    pTensor = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pTensor);
+  Type dType = op.getMemref().getType().getElementType();
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+
+  SmallVector<Value> dims;
+  for (Value dim : adaptor.getDims())
+    dims.push_back(dim);
+
+  Value handle;
+  // TODO: For now, we track how the handle is used and lower it to cusparse
+  // or cusparseLt accordingly. If both cusparse and cusparseLt are used
+  // within the same block, two separate creation ops are required for the
+  // lowering to be correct. In the future, we may support using one handle
+  // from the sparse tensor / GPU dialects with both cusparse and cusparseLt.
+  // Use the cusparseLt create call if the dense matrix is used together
+  // with a sparse matrix that has 2:4 sparsity.
+  if (dims.size() == 2) {
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      auto envHandle = adaptor.getEnv();
+      AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
+      auto handleSz = rewriter.create<LLVM::ConstantOp>(
+          loc, getIndexType(), rewriter.getIndexAttr(11032));
+      handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
+                                               llvmInt8Type, handleSz);
+      handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
+
+      createLtDnMatCallBuilder
+          .create(loc, rewriter,
+                  {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream})
+          .getResult();
+    } else {
+      handle =
+          createDnMatCallBuilder
+              .create(loc, rewriter, {dims[0], dims[1], pTensor, dtp, stream})
+              .getResult();
+    }
+  } else {
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    handle = createDnVecCallBuilder
+                 .create(loc, rewriter, {dims[0], pTensor, dtp, stream})
+                 .getResult();
+  }
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
+LogicalResult ConvertDestroyDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  auto definingOp = op.getDnTensor().getDefiningOp<gpu::CreateDnTensorOp>();
+  SmallVector<Value> dims;
+  for (Value dim : definingOp.getDims())
+    dims.push_back(dim);
+  if (dims.size() == 2) {
+    // Use the cusparseLt destroy call if the dense matrix is used together
+    // with a sparse matrix that has 2:4 sparsity.
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
+                                           {adaptor.getDnTensor(), stream});
+    } else {
+      destroyDnMatCallBuilder.create(loc, rewriter,
+                                     {adaptor.getDnTensor(), stream});
+    }
+  } else {
+    destroyDnVecCallBuilder.create(loc, rewriter,
+                                   {adaptor.getDnTensor(), stream});
+  }
+  rewriter.replaceOp(op, {stream});
+  return success();
+}
+
 LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateDnMatOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1916,6 +2039,8 @@
   addOpaquePointerConversion<gpu::SparseEnvHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseDnVecHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseDnMatHandleType>(converter);
+  // TODO: remove the above two
+  addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
 
@@ -1935,6 +2060,9 @@
                ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
                ConvertCreateDnMatOpToGpuRuntimeCallPattern,
                ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
+               // TODO: remove the above four
+               ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
+               ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -149,6 +149,8 @@
   addTypes<SparseEnvHandleType>();
   addTypes<SparseDnVecHandleType>();
   addTypes<SparseDnMatHandleType>();
+  // TODO: remove the above two
+  addTypes<SparseDnTensorHandleType>();
   addTypes<SparseSpMatHandleType>();
   addOperations<
 #define GET_OP_LIST
@@ -169,6 +171,9 @@
     return "sparse.dnvec_handle";
   case SparseHandleKind::DnMat:
     return "sparse.dnmat_handle";
+  // TODO: remove the above two
+  case SparseHandleKind::DnTensor:
+    return "sparse.dntensor_handle";
   case SparseHandleKind::SpMat:
     return "sparse.spmat_handle";
   }
@@ -225,6 +230,9 @@
     return SparseDnVecHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::DnMat))
     return SparseDnMatHandleType::get(context);
+  // TODO: remove the above two
+  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
+    return SparseDnTensorHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
     return SparseSpMatHandleType::get(context);
 
@@ -242,6 +250,10 @@
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnVec); })
       .Case<SparseDnMatHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnMat); })
+      // TODO: remove the above two
+      .Case<SparseDnTensorHandleType>([&](Type) {
+        os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
+      })
      .Case<SparseSpMatHandleType>(
          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
      .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -557,6 +557,7 @@
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  // TODO: replace with DnTensor
   Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
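
For reference, a minimal usage sketch of the two new ops, written against the assembly formats declared in GPUOps.td above. The SSA value names, sizes, and `f64` element type are placeholders, and `%env` is assumed to come from an earlier `gpu.create_sparse_env`; per the conversion patterns above, a one-dimensional handle lowers to the cusparse dense-vector runtime calls and a two-dimensional handle to the dense-matrix (or cusparseLt) calls.

```mlir
// Hypothetical values: %t0 is an async token, %env a sparse environment
// handle, %vec / %mat are device-resident buffers, %n / %rows / %cols sizes.
%dnvec, %t1 = gpu.create_dn_tensor async [%t0] %env, %n, %vec
    : index, memref<?xf64>
%dnmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %rows, %cols, %mat
    : index, index, memref<?xf64>
// ... use the handles, e.g. in gpu.spmv / gpu.spmm ...
%t3 = gpu.destroy_dn_tensor async [%t2] %dnmat
%t4 = gpu.destroy_dn_tensor async [%t3] %dnvec
```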