diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -54,6 +54,54 @@
     return
   }

+  // CHECK-LABEL: func @spgemm
+  // CHECK: llvm.call @mgpuStreamCreate
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuCreateCsr
+  // CHECK: llvm.call @mgpuCreateCsr
+  // CHECK: llvm.call @mgpuCreateCsr
+  // CHECK: llvm.call @mgpuSpGEMMCreateDescr
+  // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
+  // CHECK: llvm.call @mgpuSpGEMMCompute
+  // CHECK: llvm.call @mgpuSpGEMMGetSize
+  // CHECK: llvm.call @mgpuSetCsrPointers
+  // CHECK: llvm.call @mgpuSpGEMMCopy
+  // CHECK: llvm.call @mgpuSpGEMMDestroyDescr
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuStreamSynchronize
+  // CHECK: llvm.call @mgpuStreamDestroy
+  func.func @spgemm(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
+    %spmatA, %token3 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %spmatB, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %spmatC, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
+    %spgemmDesc, %token6 = gpu.spgemm_create_descr async [%token5]
+    %alloc = memref.alloc() : memref<0xi8> // nullptr
+    %c0 = arith.constant 0 : index
+    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
+                            [%token6]{WORK_ESTIMATION}
+                            %spmatA, %spmatB, %spmatC,
+                            %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
+    %bufferSz2, %token8 = gpu.spgemm_work_estimation_or_compute async
+                            [%token7]{COMPUTE}
+                            %spmatA, %spmatB, %spmatC,
+                            %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
+    %rows, %cols, %nnz, %token9 = gpu.spgemm_get_size async [%token8] %spmatC
+    %token10 = gpu.set_csr_pointers async [%token8] %spmatC, %mem1, %mem1, %mem2 : memref, memref, memref
+    %token11 = gpu.spgemm_copy async [%token10] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
+    %token12 = gpu.spgemm_destroy_descr async [%token11] %spgemmDesc
+    %token13 = gpu.destroy_sp_mat async [%token12] %spmatA
+    %token14 = gpu.destroy_sp_mat async [%token13] %spmatB
+    %token15 = gpu.destroy_sp_mat async [%token14] %spmatC
+    gpu.wait [%token15]
+    return
+  }
+
   // CHECK-LABEL: func @sddmm
   // CHECK: llvm.call @mgpuStreamCreate
   // CHECK: llvm.call @mgpuMemAlloc
@@ -80,69 +128,4 @@
     return
   }

-
-  // CHECK-LABEL: func @spgemm
-  // CHECK: llvm.call @mgpuStreamCreate
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuCreateCsr
-  // CHECK: llvm.call @mgpuCreateCsr
-  // CHECK: llvm.call @mgpuCreateCsr
-  // CHECK: llvm.call @mgpuSpGEMMCreateDescr
-  // CHECK: llvm.call @malloc
-  // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuSpGEMMCompute
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuMemAlloc
-  // CHECK: llvm.call @mgpuStreamSynchronize
-  // CHECK: llvm.call @mgpuStreamDestroy
-  // CHECK: llvm.call @mgpuStreamCreate
-  // CHECK: llvm.call @mgpuSpGEMMCopy
-  // CHECK: llvm.call @mgpuDestroySpMat
-  // CHECK: llvm.call @mgpuDestroySpMat
-  // CHECK: llvm.call @mgpuDestroySpMat
-  // CHECK: llvm.call @mgpuStreamSynchronize
-  // CHECK: llvm.call @mgpuStreamDestroy
-  func.func @spgemm(%arg0: index) {
-    %token0 = gpu.wait async
-    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
-    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %spmatA, %token3 = gpu.create_csr async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %spmatB, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %spmatC, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %spgemmDesc, %token6 = gpu.spgemm_create_descr async [%token5]
-    // Used as nullptr
-    %alloc = memref.alloc() : memref<0xi8>
-    %c0 = arith.constant 0 : index
-    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
-                            [%token6]{WORK_ESTIMATION}
-                            %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
-                            %spmatC, %spgemmDesc, %c0,
-                            %alloc: f32 into memref<0xi8>
-    %buf1, %token8 = gpu.alloc async [%token7] (%bufferSz1) : memref
-    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
-                            [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
-                            %spmatC, %spgemmDesc, %bufferSz1,
-                            %buf1: f32 into memref
-    %buf2, %token13 = gpu.alloc async [%token9] (%bufferSz1_1) : memref
-    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
-                            [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
-                            %spgemmDesc, %bufferSz1_1,
-                            %buf2: f32 into memref
-    %rows, %cols, %nnz, %token15 = gpu.spgemm_get_size async [%token14] %spmatC
-    %mem_columns, %token16 = gpu.alloc async [%token15] (%cols) : memref
-    %mem_values, %token17 = gpu.alloc async [%token16] (%nnz) : memref
-    gpu.wait [%token17]
-    %token18 = gpu.wait async
-    %token19 = gpu.spgemm_copy async [%token18] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
-    %token20 = gpu.destroy_sp_mat async [%token19] %spmatA
-    %token21 = gpu.destroy_sp_mat async [%token20] %spmatB
-    %token22 = gpu.destroy_sp_mat async [%token21] %spmatC
-    gpu.wait [%token22]
-    return
-  }
-
 }
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -54,24 +54,25 @@
     return
   }

-  // CHECK-LABEL: func @spgemm
-  // CHECK: %{{.*}} = gpu.wait async
-  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_create_descr async [%{{.*}}]
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ WORK_ESTIMATION} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ COMPUTE} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
-  // CHECK: %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_get_size async [%{{.*}}] %{{.*}}
-  // CHECK %{{.*}} = gpu.set_csr_pointers async [%{{.*}}] %{{.*}}, {{.*}}, {{.*}}, {{.*}} : memref, memref, memref
-  // CHECK: %{{.*}} = gpu.spgemm_copy async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
-  // CHECK: %{{.*}} = gpu.spgemm_destroy_descr async [%{{.*}}] %{{.*}}
-  // CHECK: gpu.destroy_sp_mat %{{.*}}
-  // CHECK: gpu.destroy_sp_mat %{{.*}}
-  // CHECK: gpu.destroy_sp_mat %{{.*}}
-  // CHECK: return
+  // CHECK-LABEL: func @spgemm
+  // CHECK: %{{.*}} = gpu.wait async
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_create_descr async [%{{.*}}]
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ WORK_ESTIMATION} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ COMPUTE} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
+  // CHECK: %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_get_size async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.set_csr_pointers async [%{{.*}}] %{{.*}}, {{.*}}, {{.*}}, {{.*}} : memref, memref, memref
+  // CHECK: %{{.*}} = gpu.spgemm_copy async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+  // CHECK: %{{.*}} = gpu.spgemm_destroy_descr async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
+  // CHECK: gpu.wait [%{{.*}}]
+  // CHECK: return
   func.func @spgemm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
@@ -94,9 +95,10 @@
     %token10 = gpu.set_csr_pointers async [%token8] %spmatC, %mem1, %mem1, %mem2 : memref, memref, memref
     %token11 = gpu.spgemm_copy async [%token10] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
     %token12 = gpu.spgemm_destroy_descr async [%token11] %spgemmDesc
-    gpu.destroy_sp_mat %spmatA
-    gpu.destroy_sp_mat %spmatB
-    gpu.destroy_sp_mat %spmatC
+    %token13 = gpu.destroy_sp_mat async [%token12] %spmatA
+    %token14 = gpu.destroy_sp_mat async [%token13] %spmatB
+    %token15 = gpu.destroy_sp_mat async [%token14] %spmatC
+    gpu.wait [%token15]
     return
   }