diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -291,12 +291,14 @@
   let parser = [{ return parseGPUFuncOp(parser, result); }];
 }
 
-def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
-    Arguments<(ins SymbolRefAttr:$kernel,
+def GPU_LaunchFuncOp : GPU_Op<"launch_func",
+                              [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
+    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   SymbolRefAttr:$kernel,
                    Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                    Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                    Variadic<AnyType>:$operands)>,
-    Results<(outs)> {
+    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "Launches a function as a GPU kernel";
 
   let description = [{
@@ -308,14 +310,22 @@
     function is required to be a gpu.module. And finally, the module containing
     the kernel module (which thus cannot be the top-level module) is required
     to have the `gpu.container_module` attribute. The `gpu.launch_func`
-    operation has a symbol attribute named `kernel` to identify the fully
+    operation has a symbol attribute named `kernel` to identify the fully
     specified kernel function to launch (both the gpu.module and func).
 
-    The operation takes at least six operands, with the first three operands
-    being grid sizes along x,y,z dimensions and the following three being block
-    sizes along x,y,z dimensions. When a lower-dimensional kernel is required,
-    unused sizes must be explicitly set to `1`. The remaining operands are
-    passed as arguments to the kernel function.
+    The `gpu.launch_func` supports async dependencies: the kernel does not start
+    executing until the ops producing those async dependencies have completed.
+
+    By default, the host implicitly blocks until kernel execution has
+    completed. If the `async` keyword is present, the host does not block but
+    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
+    token as a dependency.
+
+    The operation requires at least the grid and block sizes along the x,y,z
+    dimensions as arguments. When a lower-dimensional kernel is required,
+    unused sizes must be explicitly set to `1`.
+
+    The remaining operands are passed as arguments to the kernel function.
 
     Example:
 
@@ -351,11 +361,15 @@
       }
     }
 
-    gpu.launch_func
-        @kernels::@kernel_1                          // Kernel module and function.
-        grid in (%cst, %cst, %cst)                   // Grid sizes.
-        block in (%cst, %cst, %cst)                  // Block sizes.
-        (%arg0 : f32, %arg1 : memref<?xf32, 1>)      // Arguments passed to the kernel.
+    %t0 = gpu.wait async
+    %t1 = gpu.launch_func
+        @kernels::@kernel_1                          // Kernel module and function.
+        async                                        // (Optional) Don't block host, return token.
+        [%t0]                                        // (Optional) Execute only after %t0 has completed.
+        grid in (%cst, %cst, %cst)                   // Grid sizes.
+        block in (%cst, %cst, %cst)                  // Block sizes.
+        (%arg0 : f32,                                // Arguments passed to the kernel.
+         %arg1 : memref<?xf32, 1>)
   }
   ```
   }];
@@ -403,6 +417,7 @@
   let verifier = [{ return ::verify(*this); }];
   let assemblyFormat = [{
       $kernel
+      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
      `grid` `in` custom() `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
      `block` `in` custom() `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
      custom<LaunchFuncOperands>($operands, type($operands))
@@ -748,4 +763,46 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    This op synchronizes the host or the device with a list of dependent ops.
+
+    If the op contains the `async` keyword, it returns a new async token which
+    is synchronized with the op arguments. This new token is merely a shortcut
+    to the argument list, and one could replace the uses of the result with the
+    arguments for the same effect. The async version of this op is primarily
+    used to make each async token have a single use during lowering and
+    thereby make forks in async execution explicit. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    %t2 = gpu.wait async [%t0, %t1]
+    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
+    // as if the async dependencies were [%t0, %t1].
+    %t3 = gpu.baz async [%t2]
+    ```
+
+    If the op does not contain the `async` keyword, it does not return a new
+    async token but blocks until all ops producing the async dependency tokens
+    have finished execution. All dependent memory operations are visible to the
+    host once this op completes. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
+    gpu.wait [%t0, %t1]
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -438,10 +438,15 @@
   auto kernelSymbol = builder.getSymbolRefAttr(
       kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())});
   result.addAttribute(getKernelAttrName(), kernelSymbol);
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = 0; // Initially no async dependencies.
+  segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getI32VectorAttr(segmentSizes));
 }
 
 unsigned LaunchFuncOp::getNumKernelOperands() {
-  return getNumOperands() - kNumConfigOperands;
+  return getNumOperands() - asyncDependencies().size() - kNumConfigOperands;
 }
 
 StringRef LaunchFuncOp::getKernelModuleName() {
@@ -451,15 +456,17 @@
 StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); }
 
 Value LaunchFuncOp::getKernelOperand(unsigned i) {
-  return getOperation()->getOperand(i + kNumConfigOperands);
+  return getOperand(asyncDependencies().size() + kNumConfigOperands + i);
 }
 
 KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
-  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[0], operands[1], operands[2]};
 }
 
 KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
-  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
 static LogicalResult verify(LaunchFuncOp op) {
@@ -840,6 +847,30 @@
                      /*printBlockTerminators=*/false);
 }
 
+static ParseResult parseAsyncDependencies(
+    OpAsmParser &parser, Type &asyncTokenType,
+    SmallVectorImpl<OpAsmParser::OperandType> &asyncDependencies) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return parser.parseOperandList(asyncDependencies,
+                                 OpAsmParser::Delimiter::OptionalSquare);
+}
+
+static void printAsyncDependencies(OpAsmPrinter &printer, Type asyncTokenType,
+                                   OperandRange asyncDependencies) {
+  if (asyncTokenType)
+    printer << "async ";
+  if (asyncDependencies.empty())
+    return;
+  printer << "[";
+  llvm::interleaveComma(asyncDependencies, printer);
+  printer << "]";
+}
+
 #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
 
 #define GET_OP_CLASSES
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -21,6 +21,7 @@
     %c8 = constant 8 : index
     %c32 = constant 32 : i32
     "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %c32, %buffer) {
+      operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 2]> : vector<8xi32>,
       kernel = @kernel_module::@kernel
     } : (index, index, index, index, index, index, i32, memref<?xf32>) -> ()
     return
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -37,6 +37,7 @@
 func @launch_func_too_few_operands(%sz : index) {
   // expected-error@+1 {{expected 6 or more operands}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>}
       : (index, index, index, index, index) -> ()
   return
 }
@@ -55,6 +56,7 @@
 func @launch_func_missing_callee_attribute(%sz : index) {
   // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
       : (index, index, index, index, index, index) -> ()
   return
 }
@@ -428,3 +430,17 @@
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
   }
 }
+
+// -----
+
+func @sync_wait_with_result() {
+  // expected-error @+1 {{cannot name an operation with no results}}
+  %t = gpu.wait
+}
+
+// -----
+
+func @async_wait_without_result() {
+  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
+  gpu.wait async
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -73,6 +73,7 @@
     %1 = "op"() : () -> (memref<?xf32, 1>)
     // CHECK: %{{.*}} = constant 8
     %cst = constant 8 : index
+    %t0 = gpu.wait async
 
     // CHECK: gpu.launch_func @kernels::@kernel_1 grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_1 grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
@@ -80,6 +81,9 @@
 
     // CHECK: gpu.launch_func @kernels::@kernel_2 grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_2 grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
 
+    // CHECK: %{{.*}} = gpu.launch_func @kernels::@kernel_1 async [%{{.*}}] grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+    %t1 = gpu.launch_func @kernels::@kernel_1 async [%t0] grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
+
     return
   }
@@ -145,4 +149,21 @@
     // CHECK: return {{.*}} : !gpu.async.token
     return %arg0 : !gpu.async.token
   }
+
+  // CHECK-LABEL: func @async_wait
+  func @async_wait() {
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
+    %1 = gpu.wait async [%0]
+    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
+    %2 = gpu.wait async [%0, %1]
+    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
+    // CHECK-NOT: async
+    gpu.wait [%0, %1]
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    gpu.wait // Valid, but a no-op.
+    return
+  }
 }
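The `operand_segment_sizes` attribute that the updated tests attach to the generic form encodes how the flat operand list of `gpu.launch_func` is split across the segments declared in `Arguments`: the variadic async dependencies, the six grid/block sizes, and the variadic kernel arguments. A minimal sketch of a generic-form launch under that layout (the SSA value names, kernel symbol, and argument types here are illustrative, not taken from the patch):

```mlir
// No async dependencies (first segment is 0), six launch sizes, and two
// kernel arguments; the eight segment sizes must sum to the operand count.
"gpu.launch_func"(%gx, %gy, %gz, %bx, %by, %bz, %arg0, %arg1) {
  kernel = @kernels::@kernel_1,
  operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 2]> : vector<8xi32>
} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
```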