diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,11 @@
 add_mlir_dialect(GPUOps gpu)
 add_mlir_doc(GPUOps -gen-op-doc GPUOps Dialects/)
 
+set(LLVM_TARGET_DEFINITIONS GPUBase.td)  # GPUBase.td defines GPU_AsyncOpInterface.
+mlir_tablegen(GPUOpInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(GPUOpInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(MLIRGPUOpInterfacesIncGen)  # Listed in DEPENDS of lib/Dialect/GPU.
+
 set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
 mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
 mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -53,4 +53,54 @@
   }];
 }
 
+def GPU_AsyncToken : DialectType<GPU_Dialect,  // Matches gpu::AsyncTokenType.
+    CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token">,
+    BuildableType<"::mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
+
+def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {
+  let description = [{
+    Interface for GPU operations that execute asynchronously on the device.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [ // Defaults call ConcreteOp's asyncDependencies()/asyncToken().
+    InterfaceMethod<[{
+        Query the operands that represent async dependency tokens.
+      }],
+      "OperandRange", "getAsyncDependencies", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncDependencies();
+      }]
+    >,
+    InterfaceMethod<[{
+        Set the operands that represent async dependency tokens.
+      }],
+      "void", "setAsyncDependencies", (ins "ValueRange":$asyncDependencies), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        auto asyncDepOdsIndex = 0; // Async dependencies always come first.
+        auto indexAndLength = op.getODSOperandIndexAndLength(asyncDepOdsIndex);
+        op.getOperation()->setOperands(indexAndLength.first, indexAndLength.second, asyncDependencies);
+        if constexpr (op.template hasTrait<OpTrait::AttrSizedOperandSegments>()) {
+          auto attrName = ConcreteOp::getOperandSegmentSizeAttr();
+          auto sizeAttr = op.template getAttrOfType<DenseIntElementsAttr>(attrName);
+          assert(sizeAttr && "operand segment size attribute not found");
+          SmallVector<int32_t, 8> sizes; // Mutable copy of the segment sizes.
+          for (auto size : sizeAttr.getIntValues())
+            sizes.push_back(size.getSExtValue());
+          sizes[asyncDepOdsIndex] = asyncDependencies.size(); // New segment length.
+          op.setAttr(attrName, Builder(op.getContext()).getI32VectorAttr(sizes));
+        }
+      }]
+    >,
+    InterfaceMethod<[{
+        Query the result that represents the async token to depend on.
+      }],
+      "OpResult", "getAsyncToken", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncToken().template cast<OpResult>();
+      }]
+    >,
+  ];
+}
+
 #endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
 #define MLIR_DIALECT_GPU_GPUDIALECT_H
 
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/FunctionSupport.h"
 #include "mlir/IR/OpDefinition.h"
@@ -34,13 +35,21 @@
   Value z;
 };
 
+class AsyncTokenType
+    : public Type::TypeBase<AsyncTokenType, Type, TypeStorage> {
+public:
+  // Inherit the base constructors; used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
 } // end namespace gpu
 } // end namespace mlir
 
 #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.h.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.h.inc"
 
-
 #endif // MLIR_DIALECT_GPU_GPUDIALECT_H
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -249,7 +249,7 @@
       return getBody().getNumArguments() - getType().getNumInputs() -
           getNumWorkgroupAttributions();
     }
- 
+
     /// Returns a list of block arguments that correspond to buffers located in
     /// the private memory.
     ArrayRef<BlockArgument> getPrivateAttributions() {
@@ -301,7 +301,7 @@
                IntOrIndex:$blockSizeY, IntOrIndex:$blockSizeZ,
                Variadic<AnyType>:$operands)>,
     Results<(outs)> {
-  let summary = "Launches a function as a GPU kerneel";
+  let summary = "Launches a function as a GPU kernel";
 
   let description = [{
     Launch a kernel function on the specified grid of thread blocks.
@@ -756,4 +756,21 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_ForkOp : GPU_Op<"create_token", [GPU_AsyncOpInterface]> {
+  let summary = "Creates a GPU async token.";
+  let description = [{
+    This op creates a new async token from a list of async dependencies.
+
+    The op is inserted during lowering so that each async token is used only
+    once. This makes forks in async execution explicit.
+
+    The op does not imply any host synchronization.
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); // May be empty.
+  let results = (outs GPU_AsyncToken:$asyncToken);
+
+  let assemblyFormat = "(`[` $asyncDependencies^ `]`)? attr-dict"; // `[...]` only when deps exist.
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@
 
   DEPENDS
   MLIRGPUOpsIncGen
+  MLIRGPUOpInterfacesIncGen
   MLIRGPUPassIncGen
   MLIRParallelLoopMapperAttrGen
   MLIRParallelLoopMapperEnumsGen
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -16,12 +16,13 @@
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/FunctionImplementation.h"
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/StandardTypes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::gpu;
@@ -36,12 +37,34 @@
 }
 
 void GPUDialect::initialize() {
+  addTypes<AsyncTokenType>(); // Register the async token type with the dialect.
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
       >();
 }
 
+Type GPUDialect::parseType(DialectAsmParser &parser) const {
+  // Parse the type mnemonic; a keyword is one bare identifier (no spaces).
+  StringRef keyword;
+  if (parser.parseKeyword(&keyword))
+    return Type();
+  MLIRContext *context = getContext();
+
+  // Handle async token types, spelled `async.token` after the dialect prefix.
+  if (keyword == "async.token")
+    return AsyncTokenType::get(context);
+
+  parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
+  return Type();
+}
+
+void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
+  TypeSwitch<Type>(type) // Mnemonics must mirror the ones parseType accepts.
+      .Case<AsyncTokenType>([&](Type) { os << "async.token"; })
+      .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
   if (!attr.second.isa<UnitAttr>() ||
@@ -283,22 +306,20 @@
 //   (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
 // where %size-* and %iter-* will correspond to the body region arguments.
 static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
-                                ValueRange operands, KernelDim3 ids) {
+                                KernelDim3 operands, KernelDim3 ids) {
   p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
-  p << size.x << " = " << operands[0] << ", ";
-  p << size.y << " = " << operands[1] << ", ";
-  p << size.z << " = " << operands[2] << ')';
+  p << size.x << " = " << operands.x << ", "; // Region argument = launch operand.
+  p << size.y << " = " << operands.y << ", ";
+  p << size.z << " = " << operands.z << ')';
 }
 
 static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
-  ValueRange operands = op.getOperands();
-
   // Print the launch configuration.
   p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
-  printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
+  printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(),
                       op.getBlockIds());
   p << ' ' << op.getThreadsKeyword();
-  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
+  printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(),
                       op.getThreadIds());
 
   p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
@@ -777,5 +798,7 @@
                 /*printBlockTerminators=*/false);
 }
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -143,4 +143,15 @@
       "gpu.return"() : () -> ()
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 1: i64} : () -> ()
   }
+
+  func @async() {
+    // CHECK-LABEL: func @async
+    // CHECK: %[[t0:.*]] = gpu.create_token
+    %0 = gpu.create_token  // No dependencies.
+    // CHECK: %[[t1:.*]] = gpu.create_token[%[[t0]]]
+    %1 = gpu.create_token[%0]  // One dependency.
+    // CHECK: gpu.create_token[%[[t0]], %[[t1]]]
+    %2 = gpu.create_token[%0, %1]  // Several dependencies.
+    return
+  }
 }