diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --pre-sparsification-rewrite \
+// RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
+// RUN:             --sparse-gpu-codegen | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
+
+//
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-DAG:   gpu.func @kernel0
+// CHECK-DAG:   gpu.func @kernel1
+//
+// CHECK-LABEL: func.func @matmuls
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel1 blocks
+// Two chained matmuls with CSR right-hand operands: two kernels, two launches.
+func.func @matmuls(%A: tensor<1024x8xf64>,
+                   %B: tensor<8x1024xf64, #CSR>,
+                   %C: tensor<1024x1024xf64, #CSR>) -> tensor<1024x1024xf64> {
+  %Z = arith.constant dense<0.0> : tensor<1024x1024xf64>
+  %T = linalg.matmul
+      ins(%A, %B: tensor<1024x8xf64>, tensor<8x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  %D = linalg.matmul
+      ins(%T, %C: tensor<1024x1024xf64>, tensor<1024x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  return %D : tensor<1024x1024xf64>
+}
+
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
@@ -8,7 +8,8 @@
 //
 // Compute matrix matrix C = AB
 //
-// CHECK-LABEL: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-LABEL: gpu.func @kernel0(
 // CHECK-SAME:        %[[VAL_0:.*0]]: index,
 // CHECK-SAME:        %[[VAL_1:.*1]]: index,
 // CHECK-SAME:        %[[VAL_2:.*2]]: memref<?xindex>,
@@ -51,7 +52,7 @@
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
-// CHECK:       gpu.launch_func  @sparsekernels::@kernel blocks
+// CHECK:       gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
@@ -8,8 +8,8 @@
 //
 // Compute matrix vector y = Ax
 //
-//
-// CHECK:       gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK:       gpu.func @kernel0(
 // CHECK-SAME:          %[[VAL_0:.*0]]: index,
 // CHECK-SAME:          %[[VAL_1:.*1]]: memref<?xf64>,
 // CHECK-SAME:          %[[VAL_2:.*2]]: memref<?xindex>,
@@ -48,7 +48,7 @@
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
 // CHECK:       gpu.host_register
-// CHECK:       gpu.launch_func  @sparsekernels::@kernel blocks
+// CHECK:       gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7627,11 +7627,11 @@
     name = "libmlir_cuda_runtime.so",
     linkshared = True,
     linkstatic = False,
+    deps = [":mlir_cuda_runtime"],
     tags = [
         "manual",  # External dependency
         "nobuildkite",  # TODO(gcmn): Add support for this target
     ],
-    deps = [":mlir_cuda_runtime"],
 )
 
 cc_library(