diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,11 @@
 add_mlir_dialect(GPUOps gpu)
 add_mlir_doc(GPUOps -gen-op-doc GPUOps Dialects/)
 
+set(LLVM_TARGET_DEFINITIONS GPUBase.td)
+mlir_tablegen(GPUOpInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(GPUOpInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(MLIRGPUOpInterfacesIncGen)
+
 set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
 mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
 mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -53,4 +53,53 @@
   }];
 }
 
+def GPU_AsyncToken : DialectType<
+  GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token type">,
+  BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
+
+def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {
+  let description = [{
+    Interface for GPU operations that execute asynchronously on the device.
+
+    GPU operations implementing this interface take a list of dependencies
+    as `gpu.async.token` arguments and optionally return a `gpu.async.token`.
+
+    The op doesn't start executing until all dependent ops producing the async
+    dependency tokens have finished executing.
+
+    If the op returns a token, the op merely schedules the execution on the
+    device and returns immediately, without waiting for the execution to
+    complete. On the other hand, if the op does not return a token, the op
+    will wait for the execution to complete.
+  }];
+  let cppNamespace = "::mlir::gpu";
+
+  let methods = [
+    InterfaceMethod<[{
+        Query the operands that represent async dependency tokens.
+      }],
+      "OperandRange", "getAsyncDependencies", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncDependencies();
+      }]
+    >,
+    InterfaceMethod<[{
+        Adds a new token to the list of async dependencies.
+      }],
+      "void", "addAsyncDependency", (ins "Value":$token),
+      [{}], [{
+        gpu::addAsyncDependency(this->getOperation(), token);
+      }]
+    >,
+    InterfaceMethod<[{
+        Query the result that represents the async token to depend on.
+      }],
+      "OpResult", "getAsyncToken", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncToken().template dyn_cast_or_null<OpResult>();
+      }]
+    >
+  ];
+}
+
 #endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
 #define MLIR_DIALECT_GPU_GPUDIALECT_H
 
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/FunctionSupport.h"
 #include "mlir/IR/OpDefinition.h"
@@ -34,13 +35,24 @@
   Value z;
 };
 
+class AsyncTokenType
+    : public Type::TypeBase<AsyncTokenType, Type, TypeStorage> {
+public:
+  // Used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
+// Adds a `gpu.async.token` to the front of the argument list.
+void addAsyncDependency(Operation *op, Value token);
+
 } // end namespace gpu
 } // end namespace mlir
 
 #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.h.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.h.inc"
-
 #endif // MLIR_DIALECT_GPU_GPUDIALECT_H
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -249,7 +249,7 @@
       return getBody().getNumArguments() - getType().getNumInputs() -
              getNumWorkgroupAttributions();
     }
-    
+
     /// Returns a list of block arguments that correspond to buffers located in
     /// the private memory.
     ArrayRef<BlockArgument> getPrivateAttributions() {
@@ -301,7 +301,7 @@
                IntOrIndex:$blockSizeY, IntOrIndex:$blockSizeZ,
                Variadic<AnyType>:$operands)>,
     Results<(outs)> {
-  let summary = "Launches a function as a GPU kerneel";
+  let summary = "Launches a function as a GPU kernel";
   let description = [{
     Launch a kernel function on the specified grid of thread blocks.
@@ -756,4 +756,47 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    This op synchronizes the host or the device with a list of dependent ops.
+
+    If the op contains the `async` keyword, it returns a new async token which
+    is synchronized with the op arguments. This new token is merely a shortcut
+    to the argument list, and one could replace the uses of the result with the
+    arguments for the same effect. The async version of this op is primarily
+    used to make each async token have a single use during lowering and
+    thereby make forks in async execution explicit. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    %t2 = gpu.wait async [%t0, %t1]
+    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
+    // as if the async dependencies were [%t0, %t1].
+    %t3 = gpu.baz async [%t2]
+    ```
+
+    If the op does not contain the `async` keyword, it does not return a new
+    async token but blocks until all ops producing the async dependency tokens
+    have finished executing. All dependent memory operations are visible to
+    the host once this op completes. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
+    gpu.wait [%t0, %t1]
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncKeyword>(type($asyncToken)) (`[` $asyncDependencies^ `]`)?
+      attr-dict
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@
 
   DEPENDS
   MLIRGPUOpsIncGen
+  MLIRGPUOpInterfacesIncGen
   MLIRGPUPassIncGen
   MLIRParallelLoopMapperAttrGen
   MLIRParallelLoopMapperEnumsGen
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -16,12 +16,13 @@
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/FunctionImplementation.h"
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/StandardTypes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::gpu;
@@ -36,12 +37,34 @@
 }
 
 void GPUDialect::initialize() {
+  addTypes<AsyncTokenType>();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
       >();
 }
 
+Type GPUDialect::parseType(DialectAsmParser &parser) const {
+  // Parse the main keyword for the type.
+  StringRef keyword;
+  if (parser.parseKeyword(&keyword))
+    return Type();
+  MLIRContext *context = getContext();
+
+  // Handle 'async token' types.
+  if (keyword == "async.token")
+    return AsyncTokenType::get(context);
+
+  parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
+  return Type();
+}
+
+void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
+  TypeSwitch<Type>(type)
+      .Case<AsyncTokenType>([&](Type) { os << "async.token"; })
+      .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
   if (!attr.second.isa<UnitAttr>() ||
@@ -195,6 +218,26 @@
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// AsyncOpInterface
+//===----------------------------------------------------------------------===//
+
+void gpu::addAsyncDependency(Operation *op, Value token) {
+  op->insertOperands(0, {token});
+  if (!op->template hasTrait<OpTrait::AttrSizedOperandSegments>())
+    return;
+  auto attrName =
+      OpTrait::AttrSizedOperandSegments::getOperandSegmentSizeAttr();
+  auto sizeAttr = op->template getAttrOfType<DenseIntElementsAttr>(attrName);
+  if (!sizeAttr)
+    return; // Async dependencies is the only variadic operand.
+  SmallVector<int32_t, 8> sizes;
+  for (auto size : sizeAttr.getIntValues())
+    sizes.push_back(size.getSExtValue());
+  ++sizes.front();
+  op->setAttr(attrName, Builder(op->getContext()).getI32VectorAttr(sizes));
+}
+
 //===----------------------------------------------------------------------===//
 // LaunchOp
 //===----------------------------------------------------------------------===//
@@ -283,22 +326,20 @@
 // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
 // where %size-* and %iter-* will correspond to the body region arguments.
 static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
-                                ValueRange operands, KernelDim3 ids) {
+                                KernelDim3 operands, KernelDim3 ids) {
   p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
-  p << size.x << " = " << operands[0] << ", ";
-  p << size.y << " = " << operands[1] << ", ";
-  p << size.z << " = " << operands[2] << ')';
+  p << size.x << " = " << operands.x << ", ";
+  p << size.y << " = " << operands.y << ", ";
+  p << size.z << " = " << operands.z << ')';
 }
 
 static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
-  ValueRange operands = op.getOperands();
-
   // Print the launch configuration.
   p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
-  printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
+  printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(),
                       op.getBlockIds());
   p << ' ' << op.getThreadsKeyword();
-  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
+  printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(),
                       op.getThreadIds());
 
   p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
@@ -777,5 +818,23 @@
                          /*printBlockTerminators=*/false);
 }
 
+static ParseResult parseAsyncKeyword(OpAsmParser &parser,
+                                     Type &asyncTokenType) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return success();
+}
+
+static void printAsyncKeyword(OpAsmPrinter &printer, Type asyncTokenType) {
+  if (asyncTokenType)
+    printer << "async ";
+}
+
+#include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -435,3 +435,17 @@
   } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
   }
 }
+
+// -----
+
+func @sync_wait_with_result() {
+  // expected-error @+1 {{cannot name an operation with no results}}
+  %t = gpu.wait
+}
+
+// -----
+
+func @async_wait_without_result() {
+  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
+  gpu.wait async
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -143,4 +143,27 @@
       "gpu.return"() : () -> ()
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 1: i64} : () -> ()
   }
+
+  func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
+    // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
+    // CHECK: return {{.*}} : !gpu.async.token
+    return %arg0 : !gpu.async.token
+  }
+
+  func @async_wait() {
+    // CHECK-LABEL: func @async_wait
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
+    %1 = gpu.wait async [%0]
+    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
+    %2 = gpu.wait async [%0, %1]
+    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
+    // CHECK-NOT: async
+    gpu.wait [%0, %1]
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    gpu.wait // Valid, but a no-op.
+    return
+  }
 }