diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -291,12 +291,14 @@
   let parser = [{ return parseGPUFuncOp(parser, result); }];
 }
 
-def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
-    Arguments<(ins SymbolRefAttr:$kernel,
+def GPU_LaunchFuncOp : GPU_Op<"launch_func",
+                              [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
+    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+               SymbolRefAttr:$kernel,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                Variadic<AnyType>:$operands)>,
-    Results<(outs)> {
+    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "Launches a function as a GPU kernel";
 
   let description = [{
@@ -308,14 +310,22 @@
     function is required to be a gpu.module. And finally, the module containing
     the kernel module (which thus cannot be the top-level module) is required
     to have the `gpu.container_module` attribute. The `gpu.launch_func`
-    operation has a symbol attribute named `kernel` to identify the fully 
+    operation has a symbol attribute named `kernel` to identify the fully
     specified kernel function to launch (both the gpu.module and func).
 
-    The operation takes at least six operands, with the first three operands
-    being grid sizes along x,y,z dimensions and the following three being block
-    sizes along x,y,z dimensions. When a lower-dimensional kernel is required,
-    unused sizes must be explicitly set to `1`. The remaining operands are
-    passed as arguments to the kernel function.
+    The `gpu.launch_func` supports async dependencies: the kernel does not start
+    executing until the ops producing those async dependencies have completed.
+
+    By default, the host implicitly blocks until kernel execution has
+    completed. If the `async` keyword is present, the host does not block but
+    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
+    token as dependency.
+
+    The operation requires at least the grid and block sizes along the x,y,z
+    dimensions as arguments. When a lower-dimensional kernel is required,
+    unused sizes must be explicitly set to `1`.
+
+    The remaining operands are passed as arguments to the kernel function.
 
     Example:
 
@@ -351,11 +361,15 @@
         }
       }
 
+      %t0 = gpu.wait async
       gpu.launch_func
-          @kernels::@kernel_1                          // Kernel function.
-          blocks in (%cst, %cst, %cst)                 // Grid size.
-          threads in (%cst, %cst, %cst)                // Block size.
-          args(%arg0 : f32, %arg1 : memref<?xf32, 1>)  // Kernel arguments.
+          async                           // (Optional) Don't block host, return token.
+          [%t0]                           // (Optional) Execute only after %t0 has completed.
+          @kernels::@kernel_1             // Kernel function.
+          blocks in (%cst, %cst, %cst)    // Grid size.
+          threads in (%cst, %cst, %cst)   // Block size.
+          args(%arg0 : f32,               // (Optional) Kernel arguments.
+               %arg1 : memref<?xf32, 1>)
     }
     ```
   }];
@@ -402,6 +416,7 @@
 
   let verifier = [{ return ::verify(*this); }];
   let assemblyFormat = [{
+      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
       $kernel
       `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
       `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -438,10 +438,15 @@
   auto kernelSymbol = builder.getSymbolRefAttr(
       kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())});
   result.addAttribute(getKernelAttrName(), kernelSymbol);
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = 0; // Initially no async dependencies.
+  segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getI32VectorAttr(segmentSizes));
 }
 
 unsigned LaunchFuncOp::getNumKernelOperands() {
-  return getNumOperands() - kNumConfigOperands;
+  return getNumOperands() - asyncDependencies().size() - kNumConfigOperands;
 }
 
 StringRef LaunchFuncOp::getKernelModuleName() {
@@ -451,15 +456,17 @@
 StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); }
 
 Value LaunchFuncOp::getKernelOperand(unsigned i) {
-  return getOperation()->getOperand(i + kNumConfigOperands);
+  return getOperand(asyncDependencies().size() + kNumConfigOperands + i);
 }
 
 KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
-  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[0], operands[1], operands[2]};
 }
 
 KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
-  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
 static LogicalResult verify(LaunchFuncOp op) {
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -20,9 +20,10 @@
   func @foo(%buffer: memref<?xf32>) {
     %c8 = constant 8 : index
     %c32 = constant 32 : i32
-    "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %c32, %buffer) {
-      kernel = @kernel_module::@kernel
-    } : (index, index, index, index, index, index, i32, memref<?xf32>) -> ()
+    gpu.launch_func @kernel_module::@kernel
+        blocks in (%c8, %c8, %c8)
+        threads in (%c8, %c8, %c8)
+        args(%c32 : i32, %buffer : memref<?xf32>)
     return
   }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
@@ -3,7 +3,8 @@
 module attributes {gpu.container_module} {
   func @builtin() {
     %c0 = constant 1 : index
-    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_x} : (index, index, index, index, index, index) -> ()
+    gpu.launch_func @kernels::@builtin_workgroup_id_x
+        blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0)
     return
   }
 
@@ -26,7 +27,8 @@
 module attributes {gpu.container_module} {
   func @builtin() {
     %c0 = constant 1 : index
-    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_y} : (index, index, index, index, index, index) -> ()
+    gpu.launch_func @kernels::@builtin_workgroup_id_y
+        blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0)
     return
   }
 
@@ -49,7 +51,8 @@
 module attributes {gpu.container_module} {
   func @builtin() {
     %c0 = constant 1 : index
-    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_z} : (index, index, index, index, index, index) -> ()
+    gpu.launch_func @kernels::@builtin_workgroup_id_z
+        blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0)
     return
   }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/if.mlir b/mlir/test/Conversion/GPUToSPIRV/if.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/if.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/if.mlir
@@ -7,7 +7,9 @@
 } {
   func @main(%arg0 : memref<10xf32>, %arg1 : i1) {
     %c0 = constant 1 : index
-    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = @kernels::@kernel_simple_selection} : (index, index, index, index, index, index, memref<10xf32>, i1) -> ()
+    gpu.launch_func @kernels::@kernel_simple_selection
+        blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0)
+        args(%arg0 : memref<10xf32>, %arg1 : i1)
     return
   }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
@@ -15,7 +15,10 @@
     %1 = subi %c4, %c0_0 : index
     %c1_1 = constant 1 : index
     %c1_2 = constant 1 : index
-    "gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = @kernels::@load_store_kernel} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> ()
+    gpu.launch_func @kernels::@load_store_kernel
+        blocks in (%0, %c1_2, %c1_2) threads in (%1, %c1_2, %c1_2)
+        args(%arg0 : memref<12x4xf32>, %arg1 : memref<12x4xf32>, %arg2 : memref<12x4xf32>, 
+             %c0 : index, %c0_0 : index, %c1 : index, %c1_1 : index)
     return
   }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
@@ -7,7 +7,9 @@
 } {
   func @loop(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>) {
     %c0 = constant 1 : index
-    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = @kernels::@loop_kernel} : (index, index, index, index, index, index, memref<10xf32>, memref<10xf32>) -> ()
+    gpu.launch_func @kernels::@loop_kernel
+        blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0)
+        args(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>)
     return
   }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir b/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir
@@ -22,8 +22,9 @@
     %0 = "op"() : () -> (f32)
     %1 = "op"() : () -> (memref<12xf32, 11>)
     %cst = constant 1 : index
-    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@basic_module_structure }
-        : (index, index, index, index, index, index, f32, memref<12xf32, 11>) -> ()
+    gpu.launch_func @kernels::@basic_module_structure
+        blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
+        args(%0 : f32, %1 : memref<12xf32, 11>)
     return
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
--- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
@@ -18,8 +18,9 @@
     %0 = "op"() : () -> (f32)
     %1 = "op"() : () -> (memref<12xf32>)
     %cst = constant 1 : index
-    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@basic_module_structure }
-        : (index, index, index, index, index, index, f32, memref<12xf32>) -> ()
+    gpu.launch_func @kernels::@basic_module_structure
+        blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
+        args(%0 : f32, %1 : memref<12xf32>)
     return
   }
 }
@@ -63,8 +64,9 @@
     %0 = "op"() : () -> (f32)
     %1 = "op"() : () -> (memref<12xf32>)
     %cst = constant 1 : index
-    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@missing_entry_point_abi }
-        : (index, index, index, index, index, index, f32, memref<12xf32>) -> ()
+    gpu.launch_func @kernels::@missing_entry_point_abi
+        blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
+        args(%0 : f32, %1 : memref<12xf32>)
     return
   }
 }
diff --git a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
--- a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
+++ b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
@@ -26,7 +26,10 @@
   func @foo() {
     %0 = alloc() : memref<12xf32>
     %c1 = constant 1 : index
-    "gpu.launch_func"(%c1, %c1, %c1, %c1, %c1, %c1, %0) {kernel = @kernels::@kernel} : (index, index, index, index, index, index, memref<12xf32>) -> ()
+    gpu.launch_func @kernels::@kernel
+        blocks in(%c1, %c1, %c1)
+        threads in(%c1, %c1, %c1)
+        args(%0 : memref<12xf32>)
     return
   }
 }
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -37,6 +37,7 @@
 func @launch_func_too_few_operands(%sz : index) {
   // expected-error@+1 {{expected 6 or more operands}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>}
       : (index, index, index, index, index) -> ()
   return
 }
@@ -55,6 +56,7 @@
   func @launch_func_missing_callee_attribute(%sz : index) {
     // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
     "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
         : (index, index, index, index, index, index) -> ()
     return
   }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -73,6 +73,7 @@
     %1 = "op"() : () -> (memref<?xf32, 1>)
     // CHECK: %{{.*}} = constant 8
     %cst = constant 8 : index
+    %t0 = gpu.wait async
 
     // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
@@ -80,6 +81,9 @@
     // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
     gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
 
+    // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
+    %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2  blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
+
     return
   }
 
diff --git a/mlir/test/mlir-vulkan-runner/addf.mlir b/mlir/test/mlir-vulkan-runner/addf.mlir
--- a/mlir/test/mlir-vulkan-runner/addf.mlir
+++ b/mlir/test/mlir-vulkan-runner/addf.mlir
@@ -37,8 +37,9 @@
 
     %cst1 = constant 1 : index
     %cst8 = constant 8 : index
-    "gpu.launch_func"(%cst8, %cst1, %cst1, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_add }
-        : (index, index, index, index, index, index, memref<8xf32>, memref<8xf32>, memref<8xf32>) -> ()
+    gpu.launch_func @kernels::@kernel_add
+        blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1)
+        args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
     %arg6 = memref_cast %arg5 : memref<?xf32> to memref<*xf32>
     call @print_memref_f32(%arg6) : (memref<*xf32>) -> ()
     return
diff --git a/mlir/test/mlir-vulkan-runner/addi.mlir b/mlir/test/mlir-vulkan-runner/addi.mlir
--- a/mlir/test/mlir-vulkan-runner/addi.mlir
+++ b/mlir/test/mlir-vulkan-runner/addi.mlir
@@ -36,8 +36,9 @@
 
     %cst1 = constant 1 : index
     %cst8 = constant 8 : index
-    "gpu.launch_func"(%cst8, %cst8, %cst8, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_addi }
-        : (index, index, index, index, index, index, memref<8xi32>, memref<8x8xi32>, memref<8x8x8xi32>) -> ()
+    gpu.launch_func @kernels::@kernel_addi
+        blocks in (%cst8, %cst8, %cst8) threads in (%cst1, %cst1, %cst1)
+        args(%arg0 : memref<8xi32>, %arg1 : memref<8x8xi32>, %arg2 : memref<8x8x8xi32>)
     %arg6 = memref_cast %arg5 : memref<?x?x?xi32> to memref<*xi32>
     call @print_memref_i32(%arg6) : (memref<*xi32>) -> ()
     return
diff --git a/mlir/test/mlir-vulkan-runner/addi8.mlir b/mlir/test/mlir-vulkan-runner/addi8.mlir
--- a/mlir/test/mlir-vulkan-runner/addi8.mlir
+++ b/mlir/test/mlir-vulkan-runner/addi8.mlir
@@ -37,8 +37,9 @@
 
     %cst1 = constant 1 : index
     %cst8 = constant 8 : index
-    "gpu.launch_func"(%cst8, %cst8, %cst8, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_addi }
-        : (index, index, index, index, index, index, memref<8xi8>, memref<8x8xi8>, memref<8x8x8xi32>) -> ()
+    gpu.launch_func @kernels::@kernel_addi
+        blocks in (%cst8, %cst8, %cst8) threads in (%cst1, %cst1, %cst1)
+        args(%arg0 : memref<8xi8>, %arg1 : memref<8x8xi8>, %arg2 : memref<8x8x8xi32>)
     %arg6 = memref_cast %arg5 : memref<?x?x?xi32> to memref<*xi32>
     call @print_memref_i32(%arg6) : (memref<*xi32>) -> ()
     return
diff --git a/mlir/test/mlir-vulkan-runner/mulf.mlir b/mlir/test/mlir-vulkan-runner/mulf.mlir
--- a/mlir/test/mlir-vulkan-runner/mulf.mlir
+++ b/mlir/test/mlir-vulkan-runner/mulf.mlir
@@ -38,8 +38,9 @@
 
     %cst1 = constant 1 : index
     %cst4 = constant 4 : index
-    "gpu.launch_func"(%cst4, %cst4, %cst1, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_mul }
-        : (index, index, index, index, index, index, memref<4x4xf32>, memref<4x4xf32>, memref<4x4xf32>) -> ()
+    gpu.launch_func @kernels::@kernel_mul
+        blocks in (%cst4, %cst4, %cst1) threads in(%cst1, %cst1, %cst1)
+        args(%arg0 : memref<4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<4x4xf32>)
     %arg6 = memref_cast %arg5 : memref<?x?xf32> to memref<*xf32>
     call @print_memref_f32(%arg6) : (memref<*xf32>) -> ()
     return
diff --git a/mlir/test/mlir-vulkan-runner/subf.mlir b/mlir/test/mlir-vulkan-runner/subf.mlir
--- a/mlir/test/mlir-vulkan-runner/subf.mlir
+++ b/mlir/test/mlir-vulkan-runner/subf.mlir
@@ -40,8 +40,9 @@
     %cst1 = constant 1 : index
     %cst4 = constant 4 : index
     %cst8 = constant 8 : index
-    "gpu.launch_func"(%cst8, %cst4, %cst4, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_sub }
-        : (index, index, index, index, index, index, memref<8x4x4xf32>, memref<4x4xf32>, memref<8x4x4xf32>) -> ()
+    gpu.launch_func @kernels::@kernel_sub
+        blocks in (%cst8, %cst4, %cst4) threads in (%cst1, %cst1, %cst1)
+        args(%arg0 : memref<8x4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<8x4x4xf32>)
     %arg6 = memref_cast %arg5 : memref<?x?x?xf32> to memref<*xf32>
     call @print_memref_f32(%arg6) : (memref<*xf32>) -> ()
     return
diff --git a/mlir/test/mlir-vulkan-runner/time.mlir b/mlir/test/mlir-vulkan-runner/time.mlir
--- a/mlir/test/mlir-vulkan-runner/time.mlir
+++ b/mlir/test/mlir-vulkan-runner/time.mlir
@@ -44,8 +44,9 @@
 
     %cst1 = constant 1 : index
     %cst128 = constant 128 : index
-    "gpu.launch_func"(%cst128, %cst1, %cst1, %cst128, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_add }
-        : (index, index, index, index, index, index, memref<16384xf32>, memref<16384xf32>, memref<16384xf32>) -> ()
+    gpu.launch_func @kernels::@kernel_add
+        blocks in (%cst128, %cst1, %cst1) threads in (%cst128, %cst1, %cst1)
+        args(%arg0 : memref<16384xf32>, %arg1 : memref<16384xf32>, %arg2 : memref<16384xf32>)
     %arg6 = memref_cast %arg5 : memref<?xf32> to memref<*xf32>
     return
   }