diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1612,18 +1612,20 @@ Example: ```mlir - %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref + %dvec, %token = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, - AnyMemRef:$memref, Index:$size); - let results = (outs Res:$dvec, + GPU_SparseEnvHandle:$env, + AnyMemRef:$memref, + Index:$size); + let results = (outs Res:$dvec, Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $memref `,` $size attr-dict `:` type($memref) + $env `,` $memref `,` $size attr-dict `:` type($memref) }]; } @@ -1670,11 +1672,12 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_mat async [%dep] %mem, %size : memref + %dmat, %token = gpu.create_dn_mat async [%dep] %env, %rows, %cols, %mem : memref ``` }]; let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseEnvHandle:$env, Index:$rows, Index:$cols, AnyMemRef:$memref); @@ -1682,7 +1685,7 @@ let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) }]; } @@ -1829,6 +1832,41 @@ }]; } + +def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix with 2:4 sparsity operation"; + let description = [{ + The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense + format with 2:4 sparsity. + The buffers must already be copied from the host to the device prior to + using this operation. The operation returns a handle to the sparse + matrix descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. 
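Editorial note (not part of the patch or the op definition): 2:4 (50%) structured sparsity means that every contiguous group of four values along a row contains at most two nonzeros. A minimal host-side sketch of that constraint check, using made-up names (`satisfies2To4`), is:

```c++
// Editorial illustration only: check that a row-major buffer satisfies the
// 2:4 structured-sparsity constraint (at most two nonzeros in every
// contiguous group of four values).
#include <cstddef>
#include <cstdio>
#include <vector>

static bool satisfies2To4(const std::vector<float> &vals) {
  for (std::size_t i = 0; i + 4 <= vals.size(); i += 4) {
    int nonzeros = 0;
    for (std::size_t j = i; j < i + 4; ++j)
      nonzeros += (vals[j] != 0.0f) ? 1 : 0;
    if (nonzeros > 2)
      return false;
  }
  return true;
}

int main() {
  std::vector<float> row = {1.0f, 0.0f, 2.0f, 0.0f, 0.0f, 3.0f, 0.0f, 4.0f};
  std::printf("2:4 sparse: %s\n", satisfies2To4(row) ? "yes" : "no");
  return 0;
}
```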
+ + Example: + + ```mlir + %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseEnvHandle:$env, + Index:$rows, + Index:$cols, + AnyMemRef:$memref); + let results = (outs Res:$spMat, + Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + }]; +} + def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> { let summary = "Destroy sparse matrix operation"; let description = [{ @@ -2005,7 +2043,7 @@ Example: ```mlir - %buffersz, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; @@ -2017,11 +2055,12 @@ GPU_SparseDnMatHandle:$dnmatB, GPU_SparseDnMatHandle:$dnmatC, TypeAttr:$computeType); - let results = (outs Res:$bufferSz, + let results = (outs Res]>>:$bufferSzs, Optional:$asyncToken); let builders = [OpBuilder<(ins - "Type":$bufferSz, + "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, "Value":$env, @@ -2031,17 +2070,17 @@ "Type":$computeType), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, + return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `into` $computeType + $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } -def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> { +def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "SpMM operation"; let description = [{ The `gpu.spmm` operation performs the SpMM operation on the given sparse and @@ -2060,7 +2099,7 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffer into f32 + %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; @@ -2072,7 +2111,7 @@ GPU_SparseDnMatHandle:$dnmatB, GPU_SparseDnMatHandle:$dnmatC, TypeAttr:$computeType, - AnyMemRef:$buffer); + Variadic:$buffers); let results = (outs Optional:$asyncToken); let builders = [OpBuilder<(ins @@ -2083,16 +2122,16 @@ "Value":$dnmatB, "Value":$dnmatC, "Type":$computeType, - "Value":$buffer), [{ + "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, - modeB, spmatA, dnmatB, dnmatC, computeType, buffer);}]> + modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -99,6 +99,10 @@ Type llvmInt8Type = IntegerType::get(context, 8); Type llvmInt32Type = IntegerType::get(context, 32); Type llvmInt64Type = IntegerType::get(context, 64); + Type llvmInt8PointerType = + this->getTypeConverter()->getPointerType(llvmInt8Type); + Type llvmInt64PointerType = + this->getTypeConverter()->getPointerType(llvmInt64Type); Type llvmIntPtrType = IntegerType::get( context, this->getTypeConverter()->getPointerBitwidth(0)); @@ -275,6 +279,49 @@ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = { + "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = { + "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = { + "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder createSparseLtEnvCallBuilder = { + "mgpuCreateSparseLtEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySparseLtEnvCallBuilder = { + "mgpuDestroySparseLtEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createLtDnMatCallBuilder = { + "mgpuCreateCuSparseLtDnMat", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { + "mgpuDestroyCuSparseLtSpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyCuSparseLtDnMatBuilder = { + "mgpuDestroyCuSparseLtDnMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder create2To4SpMatCallBuilder = { + "mgpuCusparseLtCreate2To4SpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = { + "mgpuCuSparseLtSpMMBufferSize", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /*void *stream*/}}; + FunctionCallBuilder cuSparseLtSpmmBuilder = { + "mgpuCuSparseLtSpMM", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -577,6 +624,20 @@ ConversionPatternRewriter &rewriter) const override; }; +class ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::Create2To4SpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + class ConvertDestroySpMatOpToGpuRuntimeCallPattern : public 
ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
public:
@@ -715,6 +776,15 @@
  // TODO: add support to CUSPARSE_INDEX_16U: 1
 }

+static int32_t getCuSparseLtDataTypeFrom(Type type) {
+  if (type.isF16())
+    return 0; // CUSPARSE_COMPUTE_16F
+  if (type.isInteger(32))
+    return 1; // CUSPARSE_COMPUTE_32I
+  llvm_unreachable("unsupported type");
+  // TODO: add support to TF32
+}
+
 // Corresponding to cudaDataType_t defined in CUDA library_types.h.
 static int32_t getCuSparseDataTypeFrom(Type type) {
   if (llvm::isa(type)) {
@@ -753,6 +823,39 @@
   llvm_unreachable("unsupported element type");
 }

+// TODO: We may want a disablement/warning at MLIR compile time: cusparseLt
+// currently does not work for CUDA architectures below 8.0 and will trigger a
+// runtime error in the CUDA program. It would be good to at least emit a
+// warning when the target architecture is below 8.0 and the user still wants
+// to use cusparseLt, and to make sure the cusparseLt calls are disabled for
+// such architectures when lowering the GPU sparse dialect to LLVM calls.
+static bool is2To4Sparsity(Value spMat) {
+  if (auto op = spMat.getDefiningOp<gpu::Create2To4SpMatOp>())
+    return true;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooAoSOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
+    return false;
+  // Print the spMat defining op.
+  spMat.getDefiningOp()->print(llvm::errs());
+  llvm_unreachable("cannot find spmat def");
+}
+
+static bool isSpMMCusparseLtOp(Value op) {
+  for (Operation *user : op.getUsers()) {
+    auto spmmOp = dyn_cast<gpu::SpMMOp>(user);
+    // If the sparse operand of an SpMM user has 2:4 (50%) sparsity, we should
+    // use cusparseLt.
+    if (!spmmOp)
+      continue;
+    if (is2To4Sparsity(spmmOp.getSpmatA()))
+      return true;
+  }
+  return false;
+}
+
 // Returns whether all operands are of LLVM type.
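Editorial aside (not part of the patch): the dispatch above selects the cuSPARSELt path whenever the SpMM's sparse operand comes from `gpu.create_2to4_spmat`, so the input data is expected to already be in the 2:4 pattern. As a concrete illustration of that pattern, one simple pruning policy keeps the two largest-magnitude values in every group of four; the sketch below uses made-up names (`pruneTo2Of4`) and is not how the patch prepares data (cuSPARSELt also ships its own pruning helper, `cusparseLtSpMMAPrune`, which this patch does not call):

```c++
// Editorial illustration only: prune a row-major buffer to the 2:4 pattern by
// keeping the two largest-magnitude values in every contiguous group of four.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

static void pruneTo2Of4(std::vector<float> &vals) {
  for (std::size_t i = 0; i + 4 <= vals.size(); i += 4) {
    // Indices of the group, sorted by descending magnitude.
    std::size_t idx[4] = {i, i + 1, i + 2, i + 3};
    std::sort(idx, idx + 4, [&](std::size_t a, std::size_t b) {
      return std::fabs(vals[a]) > std::fabs(vals[b]);
    });
    vals[idx[2]] = 0.0f; // zero out the two smallest-magnitude entries
    vals[idx[3]] = 0.0f;
  }
}

int main() {
  std::vector<float> row = {0.5f, -2.0f, 1.0f, 0.25f};
  pruneTo2Of4(row);
  for (float v : row)
    std::printf("%g ", v); // prints: 0 -2 1 0
  std::printf("\n");
  return 0;
}
```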
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands, ConversionPatternRewriter &rewriter) { @@ -1314,8 +1417,23 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - auto handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + // Use the cusparseLt create call if the dnmat is used with spmat with + // 2:4 sparsity + Value handle; + if (isSpMMCusparseLtOp(op.getEnv())) { + // Assert the size is 11024 bytes + AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(11024)); + handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) + .getResult(); + } else { + handle = + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + } rewriter.replaceOp(op, {handle, stream}); return success(); } @@ -1328,7 +1446,15 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream}); + // Use the cusparseLt destroy call if the dnmat is used with spmat with + // 2:4 sparsity + if (isSpMMCusparseLtOp(op.getEnv())) { + destroySparseLtEnvCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), stream}); + } else { + destroySparseEnvCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1382,11 +1508,34 @@ pMat = rewriter.create(loc, llvmPointerType, pMat); Type dType = op.getMemref().getType().getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto handle = - createDnMatCallBuilder - .create(loc, rewriter, - {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) - .getResult(); + // TODO: For now, we track the use of the handle and lower it to cusparse / + // cusparseLt accordingly. If in a block, both cusparse and cusparseLt are + // used, we require two separate Creation ops to be the correct logic. In + // future, we may add support to using one handle in sparse tensor / GPU + // dialect in both cusparse and cusparseLt. 
use the cusparseLt create call if + // the dnmat is used with spmat with 2:4 sparsity + Value handle; + if (isSpMMCusparseLtOp(op.getDmat())) { + auto envHandle = adaptor.getEnv(); + AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(11032)); + handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + + createLtDnMatCallBuilder + .create(loc, rewriter, + {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, + dtp, stream}) + .getResult(); + } else { + handle = + createDnMatCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) + .getResult(); + } rewriter.replaceOp(op, {handle, stream}); return success(); } @@ -1399,7 +1548,14 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream}); + // Use the cusparseLt destroy call if the dnmat is used with spmat with + // 2:4 sparsity + if (isSpMMCusparseLtOp(op.getDmat())) { + destroyCuSparseLtDnMatBuilder.create(loc, rewriter, + {adaptor.getDmat(), stream}); + } else { + destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1454,8 +1610,7 @@ pIdxs = rewriter.create(loc, llvmPointerType, pIdxs); pValues = rewriter.create(loc, llvmPointerType, pValues); } - Type iType = - llvm::cast(op.getIdxs().getType()).getElementType(); + Type iType = llvm::cast(op.getIdxs().getType()).getElementType(); Type dType = llvm::cast(op.getValues().getType()).getElementType(); auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType)); @@ -1508,6 +1663,39 @@ return success(); } +LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::Create2To4SpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pMat = + MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pMat = rewriter.create(loc, llvmPointerType, pMat); + Type dType = + llvm::cast(op.getMemref().getType()).getElementType(); + auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType)); + auto envHandle = adaptor.getEnv(); + + AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); + auto handleSz = rewriter.create( + loc, getIndexType(), rewriter.getIndexAttr(44104)); + Value handle = rewriter.create(loc, llvmInt8PointerType, + llvmInt8Type, handleSz); + handle = rewriter.create(loc, llvmPointerType, handle); + + create2To4SpMatCallBuilder + .create(loc, rewriter, + {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, + dtp, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::DestroySpMatOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1516,7 +1704,14 @@ return failure(); Location loc = op.getLoc(); auto stream = adaptor.getAsyncDependencies().front(); - destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + // 
Use the cusparseLt destroy call if the spmat is 2:4 sparsity + if (is2To4Sparsity(op.getSpmat())) { + destroyCuSparseLtSpMatBuilder.create(loc, rewriter, + {adaptor.getSpmat(), stream}); + + } else { + destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1577,14 +1772,29 @@ auto stream = adaptor.getAsyncDependencies().front(); auto computeType = genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); - - auto bufferSize = spMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); - rewriter.replaceOp(op, {bufferSize, stream}); + Value bufferSize; + if (is2To4Sparsity(op.getSpmatA())) { + auto three = rewriter.create(loc, getIndexType(), + rewriter.getIndexAttr(3)); + bufferSize = rewriter.create(loc, llvmInt64PointerType, + llvmInt64Type, three); + bufferSize = + rewriter.create(loc, llvmPointerType, bufferSize); + + cuSparseLtSpmmBufferSizeBuilder + .create(loc, rewriter, + {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + } else { + bufferSize = spMMBufferSizeCallBuilder + .create(loc, rewriter, + {adaptor.getEnv(), modeA, modeB, + adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + } return success(); } @@ -1623,14 +1833,31 @@ genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); auto stream = adaptor.getAsyncDependencies().front(); - Value pBuf = - MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); - if (!getTypeConverter()->useOpaquePointers()) - pBuf = rewriter.create(loc, llvmPointerType, pBuf); - spMMCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), - adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, - pBuf, stream}); + + // Lower to cusparseLt if applicable + if (is2To4Sparsity(op.getSpmatA())) { + SmallVector pBufs; + for (Value buffer : adaptor.getBuffers()) { + Value pBuf = MemRefDescriptor(buffer).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + pBufs.push_back(pBuf); + } + cuSparseLtSpmmBuilder.create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBufs[0], pBufs[1], pBufs[2], + stream}); + } else { + Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) + .allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + spMMCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBuf, stream}); + } rewriter.replaceOp(op, {stream}); return success(); } @@ -1696,6 +1923,7 @@ ConvertCreateCooOpToGpuRuntimeCallPattern, ConvertCreateCooAoSOpToGpuRuntimeCallPattern, ConvertCreateCsrOpToGpuRuntimeCallPattern, + ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern, ConvertDestroySpMatOpToGpuRuntimeCallPattern, ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern, ConvertSpMVOpToGpuRuntimeCallPattern, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- 
a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -464,11 +464,11 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dvecX = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, vecX, szX); + token, handle, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); auto dvecY = rewriter.create(loc, dnVecHandleTp, tokenTp, - token, vecY, szY); + token, handle, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -570,12 +570,12 @@ rowA, colA, valA, isCOO, enableRT); Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); - auto dmatB = rewriter.create(loc, dnMatHandleTp, tokenTp, - token, szk, szn, matB); + auto dmatB = rewriter.create( + loc, dnMatHandleTp, tokenTp, token, handle, szk, szn, matB); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); - auto dmatC = rewriter.create(loc, dnMatHandleTp, tokenTp, - token, szm, szn, matC); + auto dmatC = rewriter.create( + loc, dnMatHandleTp, tokenTp, token, handle, szm, szn, matC); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -191,8 +191,8 @@ # We need the libcuda.so library. find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) - # We need the libcusparse.so library. - find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + + add_mlir_library(mlir_cuda_runtime SHARED @@ -201,6 +201,7 @@ EXCLUDE_FROM_LIBMLIR ) set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14) + target_include_directories(mlir_cuda_runtime PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} @@ -208,8 +209,33 @@ target_link_libraries(mlir_cuda_runtime PRIVATE ${CUDA_RUNTIME_LIBRARY} - ${CUDA_CUSPARSE_LIBRARY} ) + + if(MLIR_ENABLE_CUDA_CUSPARSE) + + # Find the libcusparse.so library if CUSPARSE build is requested. + find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + target_link_libraries(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSE_LIBRARY} + ) + + if(MLIR_ENABLE_CUDA_CUSPARSELT) + # Find the libcusparseLt.so library in package manager default path if + # CUSPARSELT build is requested. libcusparseLt.so provides sm80+ tensor + # core support for 2:4 sparsity acceleration. 
+ find_library(CUDA_CUSPARSELT_LIBRARY cusparseLt HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + find_path(CUDA_CUSPARSELT_HEADER cusparseLt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} REQUIRED) + target_include_directories(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSELT_HEADER} + ) + target_link_libraries(mlir_cuda_runtime + PRIVATE + ${CUDA_CUSPARSELT_LIBRARY} + ) + endif() + endif() endif() if(MLIR_ENABLE_ROCM_RUNNER) diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -19,7 +19,13 @@ #include "cuda.h" #include "cuda_bf16.h" #include "cuda_fp16.h" + +#if MLIR_ENABLE_CUDA_CUSPARSE #include "cusparse.h" +#if MLIR_ENABLE_CUDA_CUSPARSELT +#include "cusparseLt.h" +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) @@ -226,6 +232,8 @@ defaultDevice = device; } +#if MLIR_ENABLE_CUDA_CUSPARSE + /// /// Wrapper methods for the cuSparse library. /// @@ -445,3 +453,162 @@ matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } + +#if MLIR_ENABLE_CUDA_CUSPARSELT + +/// +/// Wrapper methods for the cuSparseLt library. +/// + +struct cusparseLtSpMatHandleAndData { + cusparseLtMatDescriptor_t mat; + void *values{nullptr}; + // TODO: the following is associated with the SpMM operator rather than the + // sparse matrix. Create workspace buffers and pass them to the SpMM + // execution. + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulDescriptor_t matmul; +}; + +struct cusparseLtDnMatHandleAndData { + cusparseLtMatDescriptor_t mat; + void *values{nullptr}; +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() { + assert(sizeof(cusparseLtHandle_t) == 11024); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtSpMatHandleSize() { + return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() { + return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { + // note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR( + cusparseLtInit(reinterpret_cast(h))) + return; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { + auto handle = reinterpret_cast(h); + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, + void *values, int32_t dw, CUstream /*stream*/) { + cusparseLtMatDescriptor_t mat; + auto handle = reinterpret_cast(h); + auto dnmat_handle = reinterpret_cast(dh); + cudaDataType_t dtp = dataTp(dw); + // assuming row-major when deciding lda + CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( + handle, &(dh->mat), rows, cols, /*lda=*/cols, + /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW)) + dnmat_handle->values = values; +} + +// This can be used to destroy both dense matrices and sparse matrices in +// cusparseLt +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { + auto matAndData = reinterpret_cast(m); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { + 
auto matAndData = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, + void *values, int32_t dw, CUstream /*stream*/) { + auto spmat_handle = reinterpret_cast(sh); + spmat_handle->values = values; + auto handle = reinterpret_cast(h); + cudaDataType_t dtp = dataTp_cusparseLt(dw); + // assuming row-major when deciding lda + CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( + handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp, + CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) +} + +// Several things are being done in this stage, algorithm selection, planning, +// and returning workspace and compressed matrices data buffer sizes. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size, + void *compressed_buffer_size, void *h, void *a, + CUstream /*stream*/) { + // TODO: support more advanced settings, e.g., the input right operand is a + // sparse matrix assuming matA is the sparse matrix + auto handle = reinterpret_cast(h); + auto matA = reinterpret_cast(a); + + CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit( + handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) + int alg = 0; + CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( + handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + sizeof(alg))) + // TODO: add transpose support + CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( + handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat), + &matB, &matC, &matC, compute_type)) + CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul, + &(matWithData.alg_sel))) + + CHECK_CUSPARSE( + cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size)) + CHECK_CUSPARSE(cusparseLtSpMMACompressedSize( + handle, &(matA.plan), compressed_size, compressed_buffer_size)) + + // avoid zero-alloc + *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size); + *compressed_size = (*compressed_size == 0 ? 1 : *compressed_size); + *compressed_buffer_size = + (*compressed_buffer_size == 0 ? 1 : *compressed_buffer_size); + return; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a, + void *b, void *c, int32_t dw, void *buf, void *dA_compressed, + void *dA_compressedBuffer, CUstream stream) { + auto handle = reinterpret_cast(h); + auto matA = reinterpret_cast(a); + auto matB = reinterpret_cast(b); + auto matC = reinterpret_cast(c); + + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatmulPlan_t plan; + cusparseLtMatmulDescriptor_t matmul; + + ALPHABETA(dw, alpha, beta) + + CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), + dA_compressed, dA_compressedBuffer, + stream)) + + // TODO: add support to multi-stream execution + // Perform the matrix multiplication. 
D = A*B+C using C==D for now + CHECK_CUSPARSE( + cusparseLtMatmul(handle, reinterpret_cast(plan), + &alpha, dA_compressed, dB, &beta, matC->values, + /*dD*/ matC->values, d_workspace, &stream, 1)) + + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + // destroy the plan associated with the sparse matrix + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan))) +} + +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -0,0 +1,35 @@ +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s + +module attributes {gpu.container_module} { + + // CHECK-LABEL: func @matmul + // CHECK: llvm.call @mgpuStreamCreate + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuCreateSparseLtEnv + // CHECK: llvm.call @mgpuCusparseLtCreate2To4SpMat + // CHECK: llvm.call @mgpuCreateCuSparseLtDnMat + // CHECK: llvm.call @mgpuCuSparseLtSpMMBufferSize + // CHECK: llvm.call @mgpuCuSparseLtSpMM + // CHECK: llvm.call @mgpuDestroyCuSparseLtSpMat + // CHECK: llvm.call @mgpuDestroyCuSparseLtDnMat + // CHECK: llvm.call @mgpuDestroySparseLtEnv + // CHECK: llvm.call @mgpuStreamSynchronize + // CHECK: llvm.call @mgpuStreamDestroy + func.func @matmul(%arg0: index) { + %token0 = gpu.wait async + %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref + %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref + %env, %token3 = gpu.create_sparse_env async [%token2] + %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple into f16 + %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + %token8 = gpu.destroy_sp_mat async [%token7] %spmat + %token9 = gpu.destroy_dn_mat async [%token8] %dnmat + %token10 = gpu.destroy_sparse_env async [%token9] %env + gpu.wait [%token10] + return + } + +} diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -22,7 +22,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat @@ -52,8 +52,8 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, 
memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat into f64 + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_mat async [%token8] %dnmat @@ -82,7 +82,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -333,15 +333,15 @@ // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_dn_vec async - %dnvec, %token6 = gpu.create_dn_vec async [%token5] %mem2, %arg0 : memref + %dnvec, %token6 = gpu.create_dn_vec async [%token5] %env, %mem2, %arg0 : memref // CHECK: gpu.spmv_buffer_size async %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 // CHECK: gpu.create_dn_mat async - %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref + %dnmat, %token9 = gpu.create_dn_mat async [%token8] %env, %arg0, %arg0, %mem2 : memref // CHECK: gpu.spmm_buffer_size async - %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat into f64 + %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 // CHECK: gpu.sddmm_buffer_size async diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir --- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir +++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir @@ -8,7 +8,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat 
async [%{{.*}}] %{{.*}} @@ -22,7 +22,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref + %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat @@ -38,7 +38,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} @@ -52,8 +52,8 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat into f64 + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_mat async [%token8] %dnmat @@ -68,7 +68,7 @@ // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} @@ -82,7 +82,7 @@ %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref %env, %token3 = gpu.create_sparse_env async [%token2] %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref + %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, 
%dnmat, %spmat into f64 %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir @@ -47,9 +47,9 @@ // CHECK: %[[VAL_41:.*]] = gpu.wait async // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]] // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref -// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref -// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref -// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] +// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref +// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref +// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]] diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir @@ -45,8 +45,8 @@ // CHECK: %[[VAL_38:.*]] = gpu.wait async // CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]] // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref -// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : memref -// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : memref +// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : memref +// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : memref // CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]] // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref // CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ 
b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7880,6 +7880,10 @@
    srcs = ["lib/ExecutionEngine/CudaRuntimeWrappers.cpp"],
    # Prevent needing EnableABIBreakingChecks symbol from LLVMSupport.
    copts = ["-DLLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING=1"],
+    # Here:
+    # MLIR_ENABLE_CUDA_CUSPARSE : enables cuSPARSE
+    # MLIR_ENABLE_CUDA_CUSPARSELT : enables cuSPARSELt
+    local_defines = ["MLIR_ENABLE_CUDA_CUSPARSE"],
    tags = [
        "manual",  # External dependency
        "nobuildkite",  # TODO(gcmn): Add support for this target
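Closing editorial note (not part of the patch): with the cuSPARSELt path, `gpu.spmm_buffer_size` now reports three sizes instead of one, following the (workspace, compressed, compressed-buffer) parameters of `mgpuCuSparseLtSpMMBufferSize` introduced above. The sketch below shows one way a host-side runner might consume those sizes; `SpMMLtBuffers` and `allocateLtBuffers` are made-up names, `cudaMalloc` is the real CUDA runtime call, and the patch itself allocates these buffers through `gpu.alloc` rather than this way:

```c++
// Editorial sketch: allocate the three device buffers whose sizes the 2:4
// spmm_buffer_size path returns. Error handling is elided; a real runner
// would check each returned cudaError_t.
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

struct SpMMLtBuffers {
  void *workspace = nullptr;     // cusparseLt matmul workspace
  void *compressedA = nullptr;   // compressed storage for the sparse A operand
  void *compressedTmp = nullptr; // scratch used while compressing A
};

static SpMMLtBuffers allocateLtBuffers(const int64_t sizes[3]) {
  SpMMLtBuffers b;
  cudaMalloc(&b.workspace, static_cast<std::size_t>(sizes[0]));
  cudaMalloc(&b.compressedA, static_cast<std::size_t>(sizes[1]));
  cudaMalloc(&b.compressedTmp, static_cast<std::size_t>(sizes[2]));
  return b;
}
```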