diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -639,6 +639,7 @@
               .getAsyncToken();
   token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
               .getAsyncToken();
+  token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -79,11 +79,11 @@
   ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
 };
 
+#ifdef MLIR_ENABLE_CUDA_CUSPARSE
 // Note that (1) Nvidia confirms the safety to share handle across multiple
 // instances, and streams. (2) Clients are responsible to call the @mgpu
 // environment initialization/destruction in a thread-safe manner, e.g.,
 // at the beginning of the program before multi-threads are created.
-#ifdef MLIR_ENABLE_CUDA_CUSPARSE
 static cusparseHandle_t cusparse_env = nullptr;
 
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -54,7 +54,8 @@
 // CHECK:           %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK:           %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK:           %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK:           %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_19]] : memref<?xindex>
+// CHECK:           %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_14]] : memref<?xindex>
+// CHECK:           %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
 // CHECK:           %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
 // CHECK:           %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
 // CHECK:           %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2622,6 +2622,9 @@
         "include/mlir/Dialect/SparseTensor/Transforms/Passes.h",
     ],
     includes = ["include"],
+    # Here:
+    #   CUSPARSE_COO_AOS : enables AoS (array-of-structures) COO
+    local_defines = ["CUSPARSE_COO_AOS"],
     deps = [
         ":AffineDialect",
         ":ArithDialect",
@@ -8095,12 +8098,18 @@
 cc_library(
     name = "_mlir_cuda_runtime",
     srcs = ["lib/ExecutionEngine/CudaRuntimeWrappers.cpp"],
+    #compatible_with = ["//buildenv/target:prod"],
     # Prevent needing EnableABIBreakingChecks symbol from LLVMSupport.
     copts = ["-DLLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING=1"],
     # Here:
     #   MLIR_ENABLE_CUDA_CUSPARSE   : enables cuSPARSE
     #   MLIR_ENABLE_CUDA_CUSPARSELT : enables cuSPARSElt
-    local_defines = ["MLIR_ENABLE_CUDA_CUSPARSE"],
+    #   CUSPARSE_COO_AOS            : enables AoS (array-of-structures) COO
+    local_defines = [
+        "MLIR_ENABLE_CUDA_CUSPARSE",
+        #     "MLIR_ENABLE_CUDA_CUSPARSELT",
+        "CUSPARSE_COO_AOS",
+    ],
     tags = [
         "manual",  # External dependency
         "nobuildkite",  # TODO(gcmn): Add support for this target