diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1540,13 +1540,28 @@ // Operation on sparse matrices, called from the host // (currently lowers to cuSparse for CUDA only, no ROCM lowering). // +def GPU_RtLibMode : I32EnumAttr<"RtLibMode", + "optional GPU runtime libraries to be enabled to support sparse ops", + [ + I32EnumAttrCase<"CUSPARSE_AND_CUSPARSE_LT", 0>, + I32EnumAttrCase<"CUSPARSE", 1> + ]> { + let genSpecializedAttr = 0; + let cppNamespace = GPU_Dialect.cppNamespace; +} + +def GPU_RtLibModeAttr : EnumAttr<GPU_Dialect, GPU_RtLibMode, "rt_lib_mode"> { + let defaultValue = "RtLibMode::CUSPARSE_AND_CUSPARSE_LT"; +} def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> { let summary = "Create sparse environment operation"; let description = [{ The `gpu.create_sparse_env` operation initializes a sparse environment. It must be executed prior to any other sparse operation. The operation - returns a handle to the new sparse environment. + takes an `RtLibMode` attribute indicating which runtime libraries to + initialize: cuSparse alone, or both cuSparse and cuSparseLt. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device). In @@ -1555,15 +1570,15 @@ Example: ```mlir - %env, %token = gpu.create_sparse_env async [%dep] + %token = gpu.create_sparse_env async [%dep] CUSPARSE_AND_CUSPARSE_LT ``` }]; - let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); - let results = (outs Res<GPU_SparseEnvHandle>:$env, - Optional<GPU_AsyncToken>:$asyncToken); + let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, + Arg<GPU_RtLibModeAttr>:$rtLibMode); + let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ - custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $rtLibMode attr-dict }]; } @@ -1573,8 +1588,8 @@ let summary = "Destroy sparse environment operation"; let description = [{ The `gpu.destroy_sparse_env` operation releases all resources of a sparse - environment represented by a handle that was previously created by a - `gpu.create_sparse_env` operation. + environment, as selected by its `RtLibMode` attribute: the cuSparse + environment alone, or both the cuSparse and cuSparseLt environments. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device).
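For orientation, the mode semantics introduced here can be sketched in plain C++. This is an illustrative sketch only, not code from the patch; `initCuSparse` and `initCuSparseLt` are hypothetical stand-ins for the `mgpuCreateSparseEnv` and `mgpuCreateSparseLtEnv` wrappers defined later in this patch:

```c++
#include <cstdio>

// Hypothetical stand-ins for the mgpuCreateSparseEnv / mgpuCreateSparseLtEnv
// runtime wrappers; the real ones live in CudaRuntimeWrappers.cpp.
static void initCuSparse() { std::puts("cuSPARSE initialized"); }
static void initCuSparseLt() { std::puts("cuSPARSELt initialized"); }

// Mirrors GPU_RtLibMode above: case 0 enables both libraries, case 1 enables
// cuSPARSE only. Both modes initialize plain cuSPARSE.
enum class RtLibMode { CUSPARSE_AND_CUSPARSE_LT = 0, CUSPARSE = 1 };

static void createSparseEnv(RtLibMode mode) {
  if (mode == RtLibMode::CUSPARSE_AND_CUSPARSE_LT)
    initCuSparseLt(); // extra cuSPARSELt setup for 2:4 structured sparsity
  initCuSparse();
}

int main() { createSparseEnv(RtLibMode::CUSPARSE); }
```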
In @@ -1583,17 +1598,17 @@ Example: ```mlir - %token = gpu.destroy_sparse_env async [%dep] %env + %token = gpu.destroy_sparse_env async [%dep] CUSPARSE ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - Arg<GPU_SparseEnvHandle>:$env); + Arg<GPU_RtLibModeAttr>:$rtLibMode); let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env attr-dict + $rtLibMode attr-dict }]; } @@ -1612,19 +1627,18 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref + %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, AnyMemRef:$memref, Variadic<Index>:$dims); let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) + $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) }]; } @@ -1788,12 +1802,11 @@ Example: ```mlir - %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref + %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, Index:$rows, Index:$cols, AnyMemRef:$memref); @@ -1802,7 +1815,7 @@ let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $rows `,` $cols `,` $memref attr-dict `:` type($memref) }]; } @@ -1877,11 +1890,10 @@ Example: ```mlir - %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY into f32 + %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32 ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1894,7 +1906,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, @@ -1902,12 +1913,12 @@ , [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, spmatA, dnX, dnY, computeType);}]> + modeA, spmatA, dnX, dnY, computeType);}]> ]; let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType + $spmatA (`{` $modeA^ `}`)? 
`,` $dnX `,` $dnY attr-dict `into` $computeType }]; } @@ -1930,11 +1941,10 @@ Example: ```mlir - %token = gpu.spmv async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 + %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1946,20 +1956,19 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, "Type":$computeType, "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, spmatA, dnX, dnY, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } @@ -1982,12 +1991,11 @@ Example: ```mlir - %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2001,7 +2009,6 @@ "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2009,12 +2016,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, - env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> + modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } @@ -2037,12 +2044,11 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 + %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2055,7 +2061,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2063,13 +2068,13 @@ "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } @@ -2088,7 +2093,7 @@ Example: ```mlir - %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 + %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 ``` The matrix arguments can also be associated with one of the following @@ -2097,7 +2102,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2110,7 +2114,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2118,12 +2121,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> + modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $spmatC attr-dict `into` $computeType }]; } @@ -2142,7 +2145,7 @@ Example: ```mlir - %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 + %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 ``` The matrix arguments can also be associated with one of the following @@ -2151,7 +2154,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2164,7 +2166,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2172,13 +2173,13 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -206,12 +206,12 @@ {llvmInt32Type /* uint32_t devIndex */}}; FunctionCallBuilder createSparseEnvCallBuilder = { "mgpuCreateSparseEnv", - llvmPointerType, + llvmVoidType, {llvmPointerType /* void *stream */}}; FunctionCallBuilder destroySparseEnvCallBuilder = { "mgpuDestroySparseEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder createDnVecCallBuilder = { "mgpuCreateDnVec", llvmPointerType, @@ -255,51 +255,48 @@ FunctionCallBuilder spMVBufferSizeCallBuilder = { "mgpuSpMVBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder spMVCallBuilder = { "mgpuSpMV", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMBufferSizeCallBuilder = { "mgpuSpMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMCallBuilder = { "mgpuSpMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, 
llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMBufferSizeCallBuilder = { "mgpuSDDMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMCallBuilder = { "mgpuSDDMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSparseLtEnvCallBuilder = { "mgpuCreateSparseLtEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder destroySparseLtEnvCallBuilder = { "mgpuDestroySparseLtEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder createLtDnMatCallBuilder = { "mgpuCreateCuSparseLtDnMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { "mgpuDestroyCuSparseLtSpMat", llvmVoidType, @@ -311,20 +308,19 @@ FunctionCallBuilder create2To4SpMatCallBuilder = { "mgpuCusparseLtCreate2To4SpMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createCuSparseLtSpMMBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, - llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, + {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder createCuSparseLtSpMMBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType /*void *stream*/}}; + llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -1403,21 +1399,15 @@ auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt create call if the dnmat is used with spmat with - // 2:4 sparsity - Value handle; - if (isSpMMCusparseLtOp(op.getEnv())) { + // Initialize cuSparseLt only when the rtLibMode attribute requests it. + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT) { - // CUDA runner asserts the size is 11024 bytes.
- auto handleSz = rewriter.create<LLVM::ConstantOp>( - loc, getIndexType(), rewriter.getIndexAttr(11024)); - handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); - handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle); - createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) - .getResult(); - } else { - handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + createSparseLtEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); } - rewriter.replaceOp(op, {handle, stream}); + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT || + op.getRtLibMode() == gpu::RtLibMode::CUSPARSE) { + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + } + rewriter.replaceOp(op, {stream}); return success(); } @@ -1431,12 +1421,12 @@ auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt destroy call if the dnmat is used with spmat with - // 2:4 sparsity - if (isSpMMCusparseLtOp(op.getEnv())) { - destroySparseLtEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } else { - destroySparseEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); + // Tear down only the environments that the rtLibMode attribute enabled. + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT) { + destroySparseLtEnvCallBuilder.create(loc, rewriter, {stream}); + } + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT || + op.getRtLibMode() == gpu::RtLibMode::CUSPARSE) { + destroySparseEnvCallBuilder.create(loc, rewriter, {stream}); } rewriter.replaceOp(op, {stream}); return success(); } @@ -1471,7 +1461,6 @@ // the dnmat is used with spmat with 2:4 sparsity if (dims.size() == 2) { if (isSpMMCusparseLtOp(op.getDnTensor())) { - auto envHandle = adaptor.getEnv(); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1480,7 +1469,7 @@ createLtDnMatCallBuilder .create(loc, rewriter, - {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream}) + {handle, dims[0], dims[1], pTensor, dtp, stream}) .getResult(); } else { handle = @@ -1648,7 +1637,6 @@ Type dType = llvm::cast<MemRefType>(op.getMemref().getType()).getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto envHandle = adaptor.getEnv(); // CUDA runner asserts the size is 44104 bytes.
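As an aside on the 11024/44104-byte comments above: cuSparseLt descriptors cross the generated-code boundary as raw byte buffers of a fixed size, allocated by compiled IR and zero-initialized by the runtime wrappers. Below is a minimal sketch of that contract; `SpMatHandleAndData` is a hypothetical stand-in for the real `cusparseLtSpMatHandleAndData`, and the 44104 figure is taken from the comment in this patch:

```c++
#include <cstring>

// Stand-in for cusparseLtSpMatHandleAndData: an opaque blob holding the
// cusparseLt descriptor, plan, and data pointers.
struct SpMatHandleAndData {
  unsigned char opaque[44104];
};

// Compiled IR allocas exactly this many bytes; the size is pinned at compile
// time, mirroring the static_assert checks in CudaRuntimeWrappers.cpp.
static_assert(sizeof(SpMatHandleAndData) == 44104, "descriptor size drifted");

// A wrapper receives the caller-allocated buffer and zero-initializes the
// descriptor in place, as the cuSparseLt wrappers in this patch do.
void initSpMatHandle(void *sh) {
  std::memset(sh, 0, sizeof(SpMatHandleAndData));
}

int main() {
  SpMatHandleAndData buf; // generated code performs an llvm.alloca instead
  initSpMatHandle(&buf);
}
```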
auto handleSz = rewriter.create( @@ -1659,8 +1647,7 @@ create2To4SpMatCallBuilder .create(loc, rewriter, - {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, - dtp, stream}) + {handle, adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) .getResult(); rewriter.replaceOp(op, {handle, stream}); return success(); @@ -1697,12 +1684,11 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = - spMVBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, stream}) - .getResult(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1723,9 +1709,8 @@ if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); spMVCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, pBuf, - stream}); + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } @@ -1750,9 +1735,8 @@ llvmInt64Type, three); createCuSparseLtSpMMBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, stream}) + {bufferSize, modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream}) .getResult(); auto bufferSizePtr1 = rewriter.create( @@ -1774,12 +1758,12 @@ } else { auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); - bufferSize = createSpMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); + bufferSize = + createSpMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } return success(); @@ -1797,12 +1781,12 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = createSDDMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, stream}) - .getResult(); + auto bufferSize = + createSDDMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), + adaptor.getSpmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1832,17 +1816,17 @@ } createCuSparseLtSpMMBuilder.create( loc, rewriter, - {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), pBufs[0], pBufs[1], pBufs[2], stream}); + {adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, 
llvmPointerType, pBuf); - createSpMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), - adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, pBuf, stream}); + createSpMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBuf, stream}); } rewriter.replaceOp(op, {stream}); return success(); @@ -1872,10 +1856,10 @@ MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); - createSDDMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, pBuf, stream}); + createSDDMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), + adaptor.getDnmatB(), adaptor.getSpmatC(), + computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -494,14 +494,12 @@ // Create sparse environment and sparse matrix/dense vector handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spmatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA, @@ -509,11 +507,11 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dvecX = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX); + loc, dnTensorHandleTp, tokenTp, token, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); auto dvecY = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY); + loc, dnTensorHandleTp, tokenTp, token, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -521,7 +519,7 @@ // Precompute buffersize for SpMV. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY, + loc, indexTp, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -530,9 +528,8 @@ token = buf.getAsyncToken(); // Perform the SpMV. - auto spmvComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnX, - dnY, /*computeType=*/dnYType, buffer); + auto spmvComp = rewriter.create( + loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer); token = spmvComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -542,7 +539,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnY) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) @@ -601,14 +600,12 @@ // Create sparse environment and sparse matrix/dense matrix handles. 
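With this change, creating the sparse environment no longer yields an SSA handle; the library handles live in lazily initialized static storage inside the runtime wrappers (CudaRuntimeWrappers.cpp, further below). A runnable sketch of that storage pattern, with `FakeHandle`, `fakeCreate`, and `fakeDestroy` as hypothetical stand-ins for `cusparseHandle_t`, `cusparseCreate`, and `cusparseDestroy`:

```c++
#include <cstdio>

// Hypothetical stand-ins for cusparseHandle_t / cusparseCreate / cusparseDestroy.
using FakeHandle = int;
static void fakeCreate(FakeHandle *h) { *h = 1; std::puts("handle created"); }
static void fakeDestroy(FakeHandle) { std::puts("handle destroyed"); }

// Mirrors ScopedCuSparseHandleStorage below: the handle is a class-level
// static, created the first time any instance is constructed and reused by
// later instances. (Like the patch, this sketch is not thread-safe; a
// function-local static would give thread-safe initialization.)
class ScopedHandleStorage {
public:
  static FakeHandle env;
  static bool initiated;
  ScopedHandleStorage() {
    if (!initiated) {
      fakeCreate(&env);
      initiated = true;
    }
  }
};
FakeHandle ScopedHandleStorage::env = 0;
bool ScopedHandleStorage::initiated = false;

int main() {
  ScopedHandleStorage a; // first use: creates the handle
  ScopedHandleStorage b; // later uses: reuse the same handle
  fakeDestroy(ScopedHandleStorage::env); // mgpuDestroySparseEnv analogue
  ScopedHandleStorage::initiated = false;
}
```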
Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA, @@ -616,12 +613,12 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dmatB = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matB, + loc, dnTensorHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); auto dmatC = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matC, + loc, dnTensorHandleTp, tokenTp, token, matC, SmallVector{szm, szn}); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); @@ -630,7 +627,7 @@ // Precompute buffersize for SpMM. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC, + loc, indexTp, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dmatCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -641,9 +638,8 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Perform the SpMM. - auto spmmComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnB, - dnC, /*computeType=*/dnCType, buffer); + auto spmmComp = rewriter.create( + loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer); token = spmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -653,7 +649,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) @@ -715,24 +713,20 @@ // Create sparse environment and sparse matrix/dense matrix handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnMatHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); auto dmatA = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matA, - SmallVector{szm, szk}); + loc, dnMatHandleTp, tokenTp, token, matA, SmallVector{szm, szk}); Value dnA = dmatA.getResult(0); token = dmatA.getAsyncToken(); auto dmatB = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matB, - SmallVector{szk, szn}); + loc, dnMatHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); @@ -745,7 +739,7 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Precompute buffersize for SDDMM. 
auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType); + loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); auto buf = genAllocBuffer(rewriter, loc, bufferSz, token); @@ -753,8 +747,8 @@ token = buf.getAsyncToken(); // Perform the SDDMM. - auto sddmmComp = rewriter.create( - loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer); + auto sddmmComp = rewriter.create(loc, tokenTp, token, dnA, dnB, + spMatC, dnCType, buffer); token = sddmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -764,7 +758,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, spMatC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, buffer, token); token = genDeallocMemRef(rewriter, loc, matA, token); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -79,6 +79,49 @@ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } }; +#ifdef MLIR_ENABLE_CUDA_CUSPARSE +// Create the cusparse handles once for the duration of the instance +class ScopedCuSparseHandleStorage { +public: + static cusparseHandle_t env; + static bool initiated; + ScopedCuSparseHandleStorage() { + // Static reference to CUDA cuSparse environment handle + if (!initiated) { + CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&env)); + initiated = true; + } + } + + ~ScopedCuSparseHandleStorage() {} +}; + +cusparseHandle_t ScopedCuSparseHandleStorage::env = nullptr; +bool ScopedCuSparseHandleStorage::initiated = false; + +#ifdef MLIR_ENABLE_CUDA_CUSPARSELT +class ScopedCuSparseLtHandleStorage { +public: + static cusparseLtHandle_t env; + static bool initiated; + ScopedCuSparseLtHandleStorage() { + // Static reference to CUDA cuSparseLt environment handle + if (!initiated) { + initiated = true; + // note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&env)); + } + } + + ~ScopedCuSparseLtHandleStorage() {} +}; + +cusparseLtHandle_t ScopedCuSparseLtHandleStorage::env; +bool ScopedCuSparseLtHandleStorage::initiated = false; + +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE + extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { ScopedContext scopedContext; CUmodule module = nullptr; @@ -270,17 +313,17 @@ (beta##p) = reinterpret_cast(&(beta##d)); \ } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv(CUstream /*stream*/) { - cusparseHandle_t handle = nullptr; - CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle)) - return reinterpret_cast(handle); + ScopedCuSparseHandleStorage hstorage; + return; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseEnv(void *h, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle)) +mgpuDestroySparseEnv(CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; + CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(hstorage.env)) + hstorage.initiated = false; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * @@ -359,10 +402,10 @@ CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) } -extern "C" 
MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp, - CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize( + int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; + cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); @@ -370,32 +413,33 @@ cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; - CUSPARSE_REPORT_IF_ERROR( - cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY, - cTp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize( + hstorage.env, modeA, alphap, matA, vecX, betap, vecY, cTp, + CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a, - void *x, void *y, - int32_t ctp, void *buf, +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x, + void *y, int32_t ctp, + void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); cusparseDnVecDescr_t vecY = reinterpret_cast(y); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(hstorage.env, modeA, alphap, matA, vecX, betap, vecY, cTp, CUSPARSE_SPMV_ALG_DEFAULT, buf)) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -405,15 +449,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -421,16 +466,16 @@ cusparseDnMatDescr_t matC = reinterpret_cast(c); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, buf)) } // TODO: add support to passing alpha and beta as arguments extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -440,15 +485,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -456,8 +502,8 @@ cusparseSpMatDescr_t matC = reinterpret_cast(c); auto cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } @@ -488,29 +534,28 @@ static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { - // note that cuSparseLt still uses cusparseStatus_t - CUSPARSE_REPORT_IF_ERROR( - cusparseLtInit(reinterpret_cast(h))) +mgpuCreateSparseLtEnv(CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +mgpuDestroySparseLtEnv(CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&(hstorage.env))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dtp, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); +mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, + int32_t dtp, CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; + // CusparseLt expects the descriptors to be zero-initialized. memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast(dh); auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + &(hstorage.env), &(dnmat_handle->mat), rows, cols, /*lda=*/cols, /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) dnmat_handle->values = values; } @@ -530,29 +575,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, +mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols, void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast(sh); // CusparseLt expects the descriptors to be zero-initialized. 
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, - dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + &(hstorage.env), &(spmat_handle->mat), rows, cols, /*ld=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, - void *b, void *c, int32_t ctp, - CUstream /*stream*/) { +mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, + void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); @@ -565,22 +610,25 @@ cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( - handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), - &(matC->mat), &(matC->mat), cTp)) + &(hstorage.env), &(matA->matmul), modeA, modeB, &(matA->mat), + &(matB->mat), &(matC->mat), &(matC->mat), cTp)) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( - handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) + &(hstorage.env), &(matA->alg_sel), &(matA->matmul), + CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( - handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + &(hstorage.env), &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( - handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + &(hstorage.env), &(matA->plan), &(matA->matmul), &(matA->alg_sel))) - CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace( + &(hstorage.env), &(matA->plan), &workspace_size_)) CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( - handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_)) + &(hstorage.env), &(matA->plan), &compressed_size_, + &compressed_buffer_size_)) + // avoid zero-alloc *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_); *compressed_size = (compressed_size_ == 0 ? 
1 : compressed_size_); @@ -589,23 +637,23 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, +mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace, void *dA_compressed, void *dA_compressedBuffer, CUstream stream) { - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); ALPHABETA(CUDA_R_32F, alpha, beta) CUSPARSE_REPORT_IF_ERROR( - cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + cusparseLtSpMMACompress(&(hstorage.env), &(matA->plan), (matA->values), dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. D = A*B+C using C==D for now CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + cusparseLtMatmul(&(hstorage.env), &(matA->plan), alphap, dA_compressed, matB->values, betap, matC->values, /*dD*/ matC->values, d_workspace, nullptr, 0)) diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -20,14 +20,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE_AND_CUSPARSE_LT + %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE_AND_CUSPARSE_LT gpu.wait [%token10] return } diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -20,14 +20,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 
: memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -50,14 +50,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -80,14 +80,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 - %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64 + %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -327,27 +327,27 @@ // CHECK: gpu.alloc async %mem2, %token2 = gpu.alloc async [%token1] 
(%arg0) : memref // CHECK: gpu.create_sparse_env async - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE // CHECK: gpu.create_coo async %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_dn_tensor async - %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref + %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref // CHECK: gpu.spmv_buffer_size async - %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 + %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async - %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 // CHECK: gpu.create_dn_tensor async - %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref + %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref // CHECK: gpu.spmm_buffer_size async - %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 + %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async - %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 // CHECK: gpu.sddmm_buffer_size async - %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64 + %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64 // CHECK: gpu.sddmm async - %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref into f64 // CHECK: gpu.destroy_dn_tensor async %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat // CHECK: gpu.destroy_sp_mat async @@ -355,7 +355,7 @@ // CHECK: gpu.destroy_dn_tensor async %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec // CHECK: gpu.destroy_sparse_env async - %token17 = gpu.destroy_sparse_env async [%token16] %env + %token17 = gpu.destroy_sparse_env async [%token16] CUSPARSE // CHECK: gpu.wait gpu.wait [%token17] return diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir --- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir +++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir @@ -6,28 +6,28 @@ // CHECK: %{{.*}} = gpu.wait async // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] + // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref - // CHECK: %{{.*}}, %{{.*}} = 
gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 - // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}} : index into memref + // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64 + // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE // CHECK: gpu.wait [%{{.*}}] // CHECK: return func.func @matvec(%arg0: index) { %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -36,28 +36,28 @@ // CHECK: %{{.*}} = gpu.wait async // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] + // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref - // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 - // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref + // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64 + // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE // CHECK: gpu.wait [%{{.*}}] // CHECK: return func.func @matmul(%arg0: index) { %token0 = gpu.wait async 
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %token3 = gpu.create_sparse_env async [%token2] CUSPARSE
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
-    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+    %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE
     gpu.wait [%token10]
     return
   }
@@ -66,28 +66,28 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
+  // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @sddmm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %token3 = gpu.create_sparse_env async [%token2] CUSPARSE
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
-    %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+    %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE
     gpu.wait [%token10]
     return
   }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -45,17 +45,17 @@
 // CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
-// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
+// CHECK: %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]] CUSPARSE
 // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
+// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] CUSPARSE
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref
 // CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref
 // CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -43,17 +43,17 @@
 // CHECK: %[[VAL_37:.*]] = gpu.memcpy async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_32]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_15]], %[[VAL_20]], %[[VAL_25]], %[[VAL_31]], %[[VAL_37]]]
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
+// CHECK: %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]] CUSPARSE
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : index into memref
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : index into memref
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
-// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
+// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] CUSPARSE
 // CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref
 // CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -53,17 +53,17 @@
 // CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_18]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_10]], %[[VAL_15]], %[[VAL_23]], %[[VAL_28]], %[[VAL_33]]]
 // CHECK: %[[VAL_34:.*]] = gpu.wait async
-// CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]]
-// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_35]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]] CUSPARSE
+// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_csr async {{\[}}%[[VAL_40]]] %[[VAL_3]], %[[VAL_3]], %[[VAL_5]], %[[VAL_21]], %[[VAL_26]], %[[VAL_31]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
 // CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.alloc async {{\[}}%[[VAL_44]]] (%[[VAL_43]]) : memref
-// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
+// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
 // CHECK: %[[VAL_48:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_37]]
 // CHECK: %[[VAL_49:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_48]]] %[[VAL_39]]
 // CHECK: %[[VAL_50:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_49]]] %[[VAL_41]]
-// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] %[[VAL_35]]
+// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] CUSPARSE
 // CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_51]]] %[[VAL_45]] : memref
 // CHECK: %[[VAL_53:.*]] = gpu.dealloc async {{\[}}%[[VAL_52]]] %[[VAL_8]] : memref<8x8xf64>
 // CHECK: %[[VAL_54:.*]] = gpu.dealloc async {{\[}}%[[VAL_53]]] %[[VAL_13]] : memref<8x8xf64>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -28,18 +28,18 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %env, %token7 = gpu.create_sparse_env async [%token6]
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
-    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
-    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
-    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+    %token7 = gpu.create_sparse_env async [%token6] CUSPARSE
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %c16, %c32, %d_a : memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16 : index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16 : index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index, index into f16
     %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref
     %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref
     %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref
-    %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref, memref into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token18 = gpu.destroy_sparse_env async [%token17] %env
+    %token18 = gpu.destroy_sparse_env async [%token17] CUSPARSE
     %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
    %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>