diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td @@ -116,17 +116,11 @@ "sparse environment handle type">, BuildableType<"mlir::gpu::SparseEnvHandleType::get($_builder.getContext())">; -def GPU_SparseDnVecHandle : +def GPU_SparseDnTensorHandle : DialectType($_self)">, - "dense vector handle type">, - BuildableType<"mlir::gpu::SparseDnVecHandleType::get($_builder.getContext())">; - -def GPU_SparseDnMatHandle : - DialectType($_self)">, - "dense matrix handle type">, - BuildableType<"mlir::gpu::SparseDnMatHandleType::get($_builder.getContext())">; + CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">, + "dense tensor handle type">, + BuildableType<"mlir::gpu::SparseDnTensorHandleType::get($_builder.getContext())">; def GPU_SparseSpMatHandle : DialectType class SparseHandleType @@ -177,8 +177,7 @@ }; using SparseEnvHandleType = SparseHandleType; -using SparseDnVecHandleType = SparseHandleType; -using SparseDnMatHandleType = SparseHandleType; +using SparseDnTensorHandleType = SparseHandleType; using SparseSpMatHandleType = SparseHandleType; } // namespace gpu diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1597,73 +1597,13 @@ }]; } -def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> { - let summary = "Create dense vector operation"; +def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { + let summary = "Create dense tensor operation"; let description = [{ - The `gpu.create_dn_vec` operation initializes a dense vector from - the given values buffer and size. The buffer must already be copied - from the host to the device prior to using this operation. The - operation returns a handle to the dense vector descriptor. - - If the `async` keyword is present, the op is executed asynchronously (i.e. - it does not block until the execution has finished on the device). In - that case, it returns a !gpu.async.token in addition to the environment. - - Example: - - ```mlir - %dvec, %token = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref - ``` - }]; - - let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, - AnyMemRef:$memref, - Index:$size); - let results = (outs Res:$dvec, - Optional:$asyncToken); - - let assemblyFormat = [{ - custom(type($asyncToken), $asyncDependencies) - $env `,` $memref `,` $size attr-dict `:` type($memref) - }]; -} - -def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> { - let summary = "Destroy dense vector operation"; - let description = [{ - The `gpu.destroy_dn_vec` operation releases all resources of a dense - vector represented by a handle that was previously created by a - `gpu.create_dn_vec` operation. - - If the `async` keyword is present, the op is executed asynchronously (i.e. - it does not block until the execution has finished on the device). In - that case, it returns a !gpu.async.token in addition to the environment. 
- - Example: - - ```mlir - %token = gpu.destroy_dn_vec async [%dep] %dvec - ``` - }]; - - let arguments = (ins Variadic:$asyncDependencies, - Arg:$dvec); - let results = (outs Optional:$asyncToken); - - let assemblyFormat = [{ - custom(type($asyncToken), $asyncDependencies) - $dvec attr-dict - }]; -} - -def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> { - let summary = "Create dense matrix operation"; - let description = [{ - The `gpu.create_dn_mat` operation initializes a dense matrix from + The `gpu.create_dn_tensor` operation initializes a dense tensor from the given values buffer and sizes. The buffer must already be copied from the host to the device prior to using this operation. The - operation returns a handle to the dense matrix descriptor. + operation returns a handle to the dense tensor descriptor. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device). In @@ -1672,29 +1612,28 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_mat async [%dep] %env, %rows, %cols, %mem : memref + %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, GPU_SparseEnvHandle:$env, - Index:$rows, - Index:$cols, - AnyMemRef:$memref); - let results = (outs Res:$dmat, Optional:$asyncToken); + AnyMemRef:$memref, + Variadic:$dims); + let results = (outs Res:$dnTensor, Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) }]; } -def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> { - let summary = "Destroy dense matrix operation"; +def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> { + let summary = "Destroy dense tensor operation"; let description = [{ - The `gpu.destroy_dn_mat` operation releases all resources of a dense - matrix represented by a handle that was previously created by a - `gpu.create_dn_mat` operation. + The `gpu.destroy_dn_tensor` operation releases all resources of a dense + tensor represented by a handle that was previously created by a + `gpu.create_dn_tensor` operation. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device). 
In @@ -1703,17 +1642,17 @@ Example: ```mlir - %token = gpu.destroy_dn_vec async [%dep] %dmat + %token = gpu.destroy_dn_tensor async [%dep] %dnTensor ``` }]; let arguments = (ins Variadic:$asyncDependencies, - Arg:$dmat); + Arg:$dnTensor); let results = (outs Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $dmat attr-dict + $dnTensor attr-dict }]; } @@ -1945,8 +1884,8 @@ GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, - GPU_SparseDnVecHandle:$dnX, - GPU_SparseDnVecHandle:$dnY, + GPU_SparseDnTensorHandle:$dnX, + GPU_SparseDnTensorHandle:$dnY, TypeAttr:$computeType); let results = (outs Res:$bufferSz, Optional:$asyncToken); @@ -1998,8 +1937,8 @@ GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, - GPU_SparseDnVecHandle:$dnX, - GPU_SparseDnVecHandle:$dnY, + GPU_SparseDnTensorHandle:$dnX, + GPU_SparseDnTensorHandle:$dnY, TypeAttr:$computeType, AnyMemRef:$buffer); let results = (outs Optional:$asyncToken); @@ -2052,8 +1991,8 @@ GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, - GPU_SparseDnMatHandle:$dnmatB, - GPU_SparseDnMatHandle:$dnmatC, + GPU_SparseDnTensorHandle:$dnmatB, + GPU_SparseDnTensorHandle:$dnmatC, TypeAttr:$computeType); let results = (outs Res]>>:$bufferSzs, @@ -2108,8 +2047,8 @@ GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, - GPU_SparseDnMatHandle:$dnmatB, - GPU_SparseDnMatHandle:$dnmatC, + GPU_SparseDnTensorHandle:$dnmatB, + GPU_SparseDnTensorHandle:$dnmatC, TypeAttr:$computeType, Variadic:$buffers); let results = (outs Optional:$asyncToken); @@ -2162,8 +2101,8 @@ GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, - GPU_SparseDnMatHandle:$dnmatA, - GPU_SparseDnMatHandle:$dnmatB, + GPU_SparseDnTensorHandle:$dnmatA, + GPU_SparseDnTensorHandle:$dnmatB, GPU_SparseSpMatHandle:$spmatC, TypeAttr:$computeType); let results = (outs Res:$bufferSz, Optional:$asyncToken); @@ -2216,8 +2155,8 @@ GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, - GPU_SparseDnMatHandle:$dnmatA, - GPU_SparseDnMatHandle:$dnmatB, + GPU_SparseDnTensorHandle:$dnmatA, + GPU_SparseDnTensorHandle:$dnmatB, GPU_SparseSpMatHandle:$spmatC, TypeAttr:$computeType, AnyMemRef:$buffer); diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -548,51 +548,31 @@ ConversionPatternRewriter &rewriter) const override; }; -class ConvertCreateDnVecOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { +class ConvertCreateDnTensorOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { public: - ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; - -class ConvertDestroyDnVecOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} - -private: - LogicalResult - matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const 
override; -}; - -class ConvertCreateDnMatOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { -public: - ConvertCreateDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + ConvertCreateDnTensorOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) { + } private: LogicalResult - matchAndRewrite(gpu::CreateDnMatOp op, OpAdaptor adaptor, + matchAndRewrite(gpu::CreateDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; -class ConvertDestroyDnMatOpToGpuRuntimeCallPattern - : public ConvertOpToGpuRuntimeCallPattern { +class ConvertDestroyDnTensorOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { public: - ConvertDestroyDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + ConvertDestroyDnTensorOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} private: LogicalResult - matchAndRewrite(gpu::DestroyDnMatOp op, OpAdaptor adaptor, + matchAndRewrite(gpu::DestroyDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; @@ -1474,102 +1454,90 @@ return success(); } -LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::CreateDnVecOp op, OpAdaptor adaptor, +LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || failed(isAsyncWithOneDependency(rewriter, op))) return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - Value pVec = + Value pTensor = MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) - pVec = rewriter.create(loc, llvmPointerType, pVec); + pTensor = rewriter.create(loc, llvmPointerType, pTensor); Type dType = op.getMemref().getType().getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto handle = - createDnVecCallBuilder - .create(loc, rewriter, {adaptor.getSize(), pVec, dtp, stream}) - .getResult(); - rewriter.replaceOp(op, {handle, stream}); - return success(); -} -LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::DestroyDnVecOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream}); - rewriter.replaceOp(op, {stream}); - return success(); -} + SmallVector dims; + for (Value dim : adaptor.getDims()) { + dims.push_back(dim); + } -LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::CreateDnMatOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || - failed(isAsyncWithOneDependency(rewriter, op))) - return failure(); - Location loc = op.getLoc(); - auto stream = adaptor.getAsyncDependencies().front(); - Value pMat = - MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); - if 
(!getTypeConverter()->useOpaquePointers()) - pMat = rewriter.create(loc, llvmPointerType, pMat); - Type dType = op.getMemref().getType().getElementType(); - auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); + Value handle; // TODO: For now, we track the use of the handle and lower it to cusparse / // cusparseLt accordingly. If in a block, both cusparse and cusparseLt are // used, we require two separate Creation ops to be the correct logic. In // future, we may add support to using one handle in sparse tensor / GPU // dialect in both cusparse and cusparseLt. use the cusparseLt create call if // the dnmat is used with spmat with 2:4 sparsity - Value handle; - if (isSpMMCusparseLtOp(op.getDmat())) { - auto envHandle = adaptor.getEnv(); - AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); - auto handleSz = rewriter.create( - loc, getIndexType(), rewriter.getIndexAttr(11032)); - handle = rewriter.create(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); - handle = rewriter.create(loc, llvmPointerType, handle); - - createLtDnMatCallBuilder - .create(loc, rewriter, - {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, - dtp, stream}) - .getResult(); + if (dims.size() == 2) { + if (isSpMMCusparseLtOp(op.getDnTensor())) { + auto envHandle = adaptor.getEnv(); + AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(11032)); + handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + + createLtDnMatCallBuilder + .create(loc, rewriter, + {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream}) + .getResult(); + } else { + handle = + createDnMatCallBuilder + .create(loc, rewriter, {dims[0], dims[1], pTensor, dtp, stream}) + .getResult(); + } } else { - handle = - createDnMatCallBuilder - .create(loc, rewriter, - {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) - .getResult(); + assert(dims.size() == 1 && "Only 1D and 2D tensors are supported"); + handle = createDnVecCallBuilder + .create(loc, rewriter, {dims[0], pTensor, dtp, stream}) + .getResult(); } rewriter.replaceOp(op, {handle, stream}); return success(); } -LogicalResult ConvertDestroyDnMatOpToGpuRuntimeCallPattern::matchAndRewrite( - gpu::DestroyDnMatOp op, OpAdaptor adaptor, +LogicalResult ConvertDestroyDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroyDnTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || failed(isAsyncWithOneDependency(rewriter, op))) return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt destroy call if the dnmat is used with spmat with - // 2:4 sparsity - if (isSpMMCusparseLtOp(op.getDmat())) { - destroyCuSparseLtDnMatBuilder.create(loc, rewriter, - {adaptor.getDmat(), stream}); + auto definingOp = op.getDnTensor().getDefiningOp(); + SmallVector dims; + for (Value dim : definingOp.getDims()) { + dims.push_back(dim); + } + if (dims.size() == 2) { + // Use the cusparseLt destroy call if the dnmat is used with spmat with + // 2:4 sparsity + if (isSpMMCusparseLtOp(op.getDnTensor())) { + destroyCuSparseLtDnMatBuilder.create(loc, rewriter, + {adaptor.getDnTensor(), stream}); + } else { + destroyDnMatCallBuilder.create(loc, rewriter, + {adaptor.getDnTensor(), stream}); + } } else { - 
destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream}); + assert(dims.size() == 1 && "Only 1D and 2D tensors are supported"); + destroyDnVecCallBuilder.create(loc, rewriter, + {adaptor.getDnTensor(), stream}); } rewriter.replaceOp(op, {stream}); return success(); @@ -1914,8 +1882,7 @@ StringRef gpuBinaryAnnotation, bool kernelBarePtrCallConv) { addOpaquePointerConversion(converter); - addOpaquePointerConversion(converter); - addOpaquePointerConversion(converter); + addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); @@ -1931,10 +1898,8 @@ ConvertAsyncYieldToGpuRuntimeCallPattern, ConvertCreateSparseEnvOpToGpuRuntimeCallPattern, ConvertDestroySparseEnvOpToGpuRuntimeCallPattern, - ConvertCreateDnVecOpToGpuRuntimeCallPattern, - ConvertDestroyDnVecOpToGpuRuntimeCallPattern, - ConvertCreateDnMatOpToGpuRuntimeCallPattern, - ConvertDestroyDnMatOpToGpuRuntimeCallPattern, + ConvertCreateDnTensorOpToGpuRuntimeCallPattern, + ConvertDestroyDnTensorOpToGpuRuntimeCallPattern, ConvertCreateCooOpToGpuRuntimeCallPattern, ConvertCreateCooAoSOpToGpuRuntimeCallPattern, ConvertCreateCsrOpToGpuRuntimeCallPattern, diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -147,8 +147,7 @@ addTypes(); addTypes(); addTypes(); - addTypes(); - addTypes(); + addTypes(); addTypes(); addOperations< #define GET_OP_LIST @@ -165,10 +164,8 @@ switch (kind) { case SparseHandleKind::Env: return "sparse.env_handle"; - case SparseHandleKind::DnVec: - return "sparse.dnvec_handle"; - case SparseHandleKind::DnMat: - return "sparse.dnmat_handle"; + case SparseHandleKind::DnTensor: + return "sparse.dntensor_handle"; case SparseHandleKind::SpMat: return "sparse.spmat_handle"; } @@ -221,10 +218,8 @@ if (keyword == getSparseHandleKeyword(SparseHandleKind::Env)) return SparseEnvHandleType::get(context); - if (keyword == getSparseHandleKeyword(SparseHandleKind::DnVec)) - return SparseDnVecHandleType::get(context); - if (keyword == getSparseHandleKeyword(SparseHandleKind::DnMat)) - return SparseDnMatHandleType::get(context); + if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor)) + return SparseDnTensorHandleType::get(context); if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat)) return SparseSpMatHandleType::get(context); @@ -238,10 +233,9 @@ .Case([&](Type) { os << "async.token"; }) .Case( [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::Env); }) - .Case( - [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnVec); }) - .Case( - [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnMat); }) + .Case([&](Type) { + os << getSparseHandleKeyword(SparseHandleKind::DnTensor); + }) .Case( [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); }) .Case([&](MMAMatrixType fragTy) { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -450,7 +450,7 @@ // Create sparse environment and sparse matrix/dense vector handles. 
Type indexTp = rewriter.getIndexType(); Type envHandleTp = rewriter.getType(); - Type dnVecHandleTp = rewriter.getType(); + Type dnTensorHandleTp = rewriter.getType(); Type spmatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); @@ -463,12 +463,12 @@ rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); - auto dvecX = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, handle, vecX, szX); + auto dvecX = rewriter.create( + loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); - auto dvecY = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, handle, vecY, szY); + auto dvecY = rewriter.create( + loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -493,9 +493,9 @@ // Copy data back to host and free all the resoures. token = rewriter.create(loc, tokenTp, token, spMatA) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, dnX) + token = rewriter.create(loc, tokenTp, token, dnX) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, dnY) + token = rewriter.create(loc, tokenTp, token, dnY) .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, handle) .getAsyncToken(); @@ -557,7 +557,7 @@ // Create sparse environment and sparse matrix/dense matrix handles. Type indexTp = rewriter.getIndexType(); Type envHandleTp = rewriter.getType(); - Type dnMatHandleTp = rewriter.getType(); + Type dnTensorHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); @@ -570,12 +570,14 @@ rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); - auto dmatB = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, szk, szn, matB); + auto dmatB = rewriter.create( + loc, dnTensorHandleTp, tokenTp, token, handle, matB, + SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); - auto dmatC = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, szm, szn, matC); + auto dmatC = rewriter.create( + loc, dnTensorHandleTp, tokenTp, token, handle, matC, + SmallVector{szm, szn}); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); @@ -602,9 +604,9 @@ // Copy data back to host and free all the resoures. 
token = rewriter.create(loc, tokenTp, token, spMatA) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, dnB) + token = rewriter.create(loc, tokenTp, token, dnB) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, dnC) + token = rewriter.create(loc, tokenTp, token, dnC) .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, handle) .getAsyncToken(); diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -22,11 +22,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple into f16 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -22,11 +22,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_vec async [%token8] %dnvec + %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return @@ -52,11 +52,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token9 = gpu.destroy_dn_tensor async 
[%token8] %dnmat %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return @@ -82,11 +82,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -332,14 +332,14 @@ %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - // CHECK: gpu.create_dn_vec async - %dnvec, %token6 = gpu.create_dn_vec async [%token5] %env, %mem2, %arg0 : memref + // CHECK: gpu.create_dn_tensor async + %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref // CHECK: gpu.spmv_buffer_size async %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 - // CHECK: gpu.create_dn_mat async - %dnmat, %token9 = gpu.create_dn_mat async [%token8] %env, %arg0, %arg0, %mem2 : memref + // CHECK: gpu.create_dn_tensor async + %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref // CHECK: gpu.spmm_buffer_size async %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async @@ -348,12 +348,12 @@ %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64 // CHECK: gpu.sddmm async %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 - // CHECK: gpu.destroy_dn_mat async - %token14 = gpu.destroy_dn_mat async [%token13] %dnmat + // CHECK: gpu.destroy_dn_tensor async + %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat // CHECK: gpu.destroy_sp_mat async %token15 = gpu.destroy_sp_mat async [%token14] %spmat - // CHECK: gpu.destroy_dn_vec async - %token16 = gpu.destroy_dn_vec async [%token15] %dnvec + // CHECK: gpu.destroy_dn_tensor async + %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec // CHECK: gpu.destroy_sparse_env async %token17 = gpu.destroy_sparse_env async [%token16] %env // CHECK: gpu.wait diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir --- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir +++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir @@ -8,11 +8,11 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: 
%{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_dn_vec async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} // CHECK: gpu.wait [%{{.*}}] // CHECK: return @@ -22,11 +22,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_vec async [%token8] %dnvec + %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return @@ -38,11 +38,11 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} // CHECK: gpu.wait [%{{.*}}] // CHECK: return @@ -52,11 +52,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token9 = 
gpu.destroy_dn_tensor async [%token8] %dnmat %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return @@ -68,11 +68,11 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} // CHECK: gpu.wait [%{{.*}}] // CHECK: return @@ -82,11 +82,11 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat - %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat %token10 = gpu.destroy_sparse_env async [%token9] %env gpu.wait [%token10] return diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir @@ -47,14 +47,14 @@ // CHECK: %[[VAL_41:.*]] = gpu.wait async // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]] // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref -// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref -// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref +// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref +// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref // CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref // CHECK: %[[VAL_54:.*]] = 
gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]] -// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]] -// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]] +// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]] +// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]] // CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]] // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref // CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir @@ -45,14 +45,14 @@ // CHECK: %[[VAL_38:.*]] = gpu.wait async // CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]] // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref -// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : memref -// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : memref +// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref +// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref // CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]] // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref // CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]] -// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]] -// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]] +// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]] +// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]] // CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]] // CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref
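
For readers skimming the patch, here is a minimal usage sketch of the unified op (not part of the diff itself; the `func.func` wrapper and the `?xf64` element type are assumptions mirroring the tests above). The memref operand now comes first and the trailing variadic `dims` determine the rank, so the lowering picks the cuSPARSE DnVec path for one dimension and the DnMat/cusparseLt path for two.

```mlir
// Sketch only, not part of the patch: how the unified op reads after this
// change. The function wrapper and the ?xf64 element type are illustrative
// assumptions taken from the existing tests above.
func.func @dn_tensor_sketch(%sz : index, %mem : memref<?xf64>) {
  %token0 = gpu.wait async
  %env, %token1 = gpu.create_sparse_env async [%token0]
  // 1-D case (formerly gpu.create_dn_vec): a single trailing dim operand,
  // lowered to the cuSPARSE DnVec runtime call.
  %dnvec, %token2 = gpu.create_dn_tensor async [%token1] %env, %mem, %sz
    : index into memref<?xf64>
  // 2-D case (formerly gpu.create_dn_mat): two trailing dim operands,
  // lowered to the cuSPARSE DnMat (or cusparseLt) runtime call.
  %dnmat, %token3 = gpu.create_dn_tensor async [%token2] %env, %mem, %sz, %sz
    : index, index into memref<?xf64>
  // A single destroy op now covers both ranks.
  %token4 = gpu.destroy_dn_tensor async [%token3] %dnvec
  %token5 = gpu.destroy_dn_tensor async [%token4] %dnmat
  %token6 = gpu.destroy_sparse_env async [%token5] %env
  gpu.wait [%token6]
  return
}
```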