diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -128,6 +128,13 @@
       "dense matrix handle type">,
     BuildableType<"mlir::gpu::SparseDnMatHandleType::get($_builder.getContext())">;
 
+// TODO: remove the above two
+def GPU_SparseDnTensorHandle :
+    DialectType<GPU_Dialect,
+      CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
+      "dense tensor handle type">,
+    BuildableType<"mlir::gpu::SparseDnTensorHandleType::get($_builder.getContext())">;
+
 def GPU_SparseSpMatHandle :
     DialectType<GPU_Dialect,
       CPred<"llvm::isa<::mlir::gpu::SparseSpMatHandleType>($_self)">,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -164,8 +164,9 @@
 // Adds a `gpu.async.token` to the front of the argument list.
 void addAsyncDependency(Operation *op, Value token);
 
+// TODO: remove the DnVec and DnMat kinds once DnTensor replaces them
 // Handle types for sparse.
-enum class SparseHandleKind { Env, DnVec, DnMat, SpMat };
+enum class SparseHandleKind { Env, DnVec, DnMat, SpMat, DnTensor };
 
 template <SparseHandleKind K>
 class SparseHandleType
@@ -179,6 +180,8 @@
 using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
 using SparseDnVecHandleType = SparseHandleType<SparseHandleKind::DnVec>;
 using SparseDnMatHandleType = SparseHandleType<SparseHandleKind::DnMat>;
+// TODO: remove the above two
+using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
 
 } // namespace gpu
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1717,6 +1717,65 @@
   }];
 }
 
+def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
+  let summary = "Create dense tensor operation";
+  let description = [{
+    The `gpu.create_dn_tensor` operation initializes a dense tensor from
+    the given values buffer and sizes. The buffer must already be copied
+    from the host to the device prior to using this operation. The
+    operation returns a handle to the dense tensor descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In that
+    case, it returns a !gpu.async.token in addition to the dense tensor handle.
+
+    Example:
+
+    ```mlir
+    %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %dims, %mem : index, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseEnvHandle:$env,
+                       Variadic<Index>:$dims,
+                       AnyMemRef:$memref);
+  let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $dims `,` $memref attr-dict `:` type($dims) `,` type($memref)
+  }];
+}
+
+def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense tensor operation";
+  let description = [{
+    The `gpu.destroy_dn_tensor` operation releases all resources of a dense
+    tensor represented by a handle that was previously created by a
+    `gpu.create_dn_tensor` operation.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In that
+    case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseDnTensorHandle>:$dnTensor);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $dnTensor attr-dict
+  }];
+}
+
 def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix in COO format operation";
   let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -548,6 +548,35 @@
       ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreateDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp> {
+public:
+  ConvertCreateDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::CreateDnTensorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroyDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp> {
+public:
+  ConvertDestroyDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+// TODO: remove the DnVec and DnMat patterns below once DnTensor replaces them
 class ConvertCreateDnVecOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
 public:
@@ -1509,6 +1538,100 @@
   return success();
 }
 
+LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateDnTensorOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pTensor =
+      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers())
+    pTensor = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pTensor);
+  Type dType = op.getMemref().getType().getElementType();
+  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+
+  SmallVector<Value> dims;
+  for (Value dim : adaptor.getDims())
+    dims.push_back(dim);
+
+  Value handle;
+  // TODO: For now, we track how the handle is used and lower it to cusparse
+  // or cusparseLt accordingly. If both cusparse and cusparseLt are used
+  // within the same block, two separate creation ops are required for the
+  // lowering to be correct. In the future, we may support using one handle
+  // from the sparse tensor / GPU dialects with both cusparse and cusparseLt.
+  // Use the cusparseLt create call if the dense matrix is used together
+  // with a sparse matrix that has 2:4 sparsity.
+  if (dims.size() == 2) {
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      auto envHandle = adaptor.getEnv();
+      AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
+      auto handleSz = rewriter.create<LLVM::ConstantOp>(
+          loc, getIndexType(), rewriter.getIndexAttr(11032));
+      handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
+                                               llvmInt8Type, handleSz);
+      handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
+
+      createLtDnMatCallBuilder
+          .create(loc, rewriter,
+                  {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream})
+          .getResult();
+    } else {
+      handle =
+          createDnMatCallBuilder
+              .create(loc, rewriter, {dims[0], dims[1], pTensor, dtp, stream})
+              .getResult();
+    }
+  } else {
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    handle = createDnVecCallBuilder
+                 .create(loc, rewriter, {dims[0], pTensor, dtp, stream})
+                 .getResult();
+  }
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
+LogicalResult ConvertDestroyDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  auto definingOp = op.getDnTensor().getDefiningOp<gpu::CreateDnTensorOp>();
+  SmallVector<Value> dims;
+  for (Value dim : definingOp.getDims())
+    dims.push_back(dim);
+  if (dims.size() == 2) {
+    // Use the cusparseLt destroy call if the dense matrix is used together
+    // with a sparse matrix that has 2:4 sparsity.
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
+                                           {adaptor.getDnTensor(), stream});
+    } else {
+      destroyDnMatCallBuilder.create(loc, rewriter,
+                                     {adaptor.getDnTensor(), stream});
+    }
+  } else {
+    destroyDnVecCallBuilder.create(loc, rewriter,
+                                   {adaptor.getDnTensor(), stream});
+  }
+  rewriter.replaceOp(op, {stream});
+  return success();
+}
+
 LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateDnMatOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1916,6 +2039,8 @@
   addOpaquePointerConversion<gpu::SparseEnvHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseDnVecHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseDnMatHandleType>(converter);
+  // TODO: remove the above two
+  addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
 
@@ -1935,6 +2060,9 @@
                ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
                ConvertCreateDnMatOpToGpuRuntimeCallPattern,
                ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
+               // TODO: remove the above four
+               ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
+               ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -149,6 +149,8 @@
   addTypes<SparseEnvHandleType>();
   addTypes<SparseDnVecHandleType>();
   addTypes<SparseDnMatHandleType>();
+  // TODO: remove the above two
+  addTypes<SparseDnTensorHandleType>();
   addTypes<SparseSpMatHandleType>();
   addOperations<
 #define GET_OP_LIST
@@ -169,6 +171,9 @@
     return "sparse.dnvec_handle";
   case SparseHandleKind::DnMat:
     return "sparse.dnmat_handle";
+  // TODO: remove the above two
+  case SparseHandleKind::DnTensor:
+    return "sparse.dntensor_handle";
   case SparseHandleKind::SpMat:
     return "sparse.spmat_handle";
   }
@@ -225,6 +230,9 @@
     return SparseDnVecHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::DnMat))
     return SparseDnMatHandleType::get(context);
+  // TODO: remove the above two
+  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
+    return SparseDnTensorHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
     return SparseSpMatHandleType::get(context);
 
@@ -242,6 +250,10 @@
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnVec); })
       .Case<SparseDnMatHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnMat); })
+      // TODO: remove the above two
+      .Case<SparseDnTensorHandleType>([&](Type) {
+        os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
+      })
      .Case<SparseSpMatHandleType>(
          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
      .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -557,6 +557,7 @@
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  // TODO: replace with DnTensor
   Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
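
For reference, a minimal usage sketch of the two new ops, written against the assembly formats declared in GPUOps.td above. The SSA value names, sizes, and `f64` element type are placeholders, and `%env` is assumed to come from an earlier `gpu.create_sparse_env`; per the conversion patterns above, a one-dimensional handle lowers to the cusparse dense-vector runtime calls and a two-dimensional handle to the dense-matrix (or cusparseLt) calls.

```mlir
// Hypothetical values: %t0 is an async token, %env a sparse environment
// handle, %vec / %mat are device-resident buffers, %n / %rows / %cols sizes.
%dnvec, %t1 = gpu.create_dn_tensor async [%t0] %env, %n, %vec
    : index, memref<?xf64>
%dnmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %rows, %cols, %mat
    : index, index, memref<?xf64>
// ... use the handles, e.g. in gpu.spmv / gpu.spmm ...
%t3 = gpu.destroy_dn_tensor async [%t2] %dnmat
%t4 = gpu.destroy_dn_tensor async [%t3] %dnvec
```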