diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -16,8 +16,13 @@
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
+/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
+/// a separate kernel function.
 std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass();
 
+/// Rewrites a function region so that GPU ops execute asynchronously.
+std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
+
 /// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
 void populateGpuAllReducePatterns(MLIRContext *context,
                                   OwningRewritePatternList &patterns);
diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Passes.td
@@ -16,4 +16,9 @@
   let constructor = "mlir::createGpuKernelOutliningPass()";
 }
 
+def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> {
+  let summary = "Make GPU ops async";
+  let constructor = "mlir::createGpuAsyncRegionPass()";
+}
+
 #endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRGPU
   IR/GPUDialect.cpp
   Transforms/AllReduceLowering.cpp
+  Transforms/AsyncRegionRewriter.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
   Transforms/ParallelLoopMapper.cpp
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -0,0 +1,138 @@
+//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GPU dialect pattern rewriters that make GPU op
+// within a region execute asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Utils.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+using namespace mlir;
+namespace {
+// Pattern that matches any op with at least one region and rewrites the GPU
+// ops inside those regions to their asynchronous form.
+class GpuAsyncRegionRewriter : public RewritePattern {
+  struct Callback;
+
+public:
+  explicit GpuAsyncRegionRewriter()
+      : RewritePattern(1, RewritePattern::MatchAnyOpTypeTag{}) {}
+
+private:
+  // Replaces synchronous GPU ops in the op's region with asynchronous ones and
+  // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
+  // execution semantics and that no GPU ops are asynchronous yet.
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override;
+};
+
+// Function pass driving the rewrite; base class is generated from Passes.td.
+class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
+  void runOnFunction() override;
+};
+} // namespace
+
+struct GpuAsyncRegionRewriter::Callback {
+  // Region walk callback which turns all AsyncOpInterface ops into actual
+  // async and inserts a gpu.wait when it encounters a side-effecting or a
+  // terminator op.
+  WalkResult operator()(Operation *op) {
+    // gpu.launch regions would need outlining first; bail out on them.
+    if (isa<gpu::LaunchOp>(op)) {
+      rewriter.notifyMatchFailure(op, "replace with gpu.launch_func first");
+      return WalkResult::interrupt();
+    }
+    // NOTE(review): skip gpu.wait ops — presumably the ones this callback
+    // itself inserts during the walk; confirm against the walker semantics.
+    if (isa<gpu::WaitOp>(op))
+      return WalkResult::advance();
+    rewriter.setInsertionPoint(op);
+    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op)) {
+      // Replace GPU op with async version.
+      if (failed(rewriteAsyncOp(asyncOp)))
+        return WalkResult::interrupt();
+      matchResult = success();
+      return WalkResult::advance();
+    }
+    // Nothing pending to synchronize on.
+    if (!token)
+      return WalkResult::advance();
+    // Pure, non-terminator ops cannot observe async results; no sync needed.
+    if (!op->hasTrait<OpTrait::IsTerminator>() &&
+        MemoryEffectOpInterface::hasNoEffect(op))
+      return WalkResult::advance();
+    // Insert host synchronization before return or op with side effects.
+    // The synchronous gpu.wait has no token result, so this resets `token`.
+    token = createWaitOp(op->getLoc(), Type(), {token});
+    return WalkResult::advance();
+  }
+
+  // Rewrites `asyncOp` to its async form: chains it on the current `token`
+  // (creating a `gpu.wait async` if none exists yet) and recreates the op
+  // with an extra leading !gpu.async.token result.
+  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
+    auto *op = asyncOp.getOperation();
+    if (asyncOp.getAsyncToken())
+      // TODO: Support ops that are already async.
+      return rewriter.notifyMatchFailure(op, "is already async");
+    if (op->getNumRegions() > 0)
+      return rewriter.notifyMatchFailure(op, "regions are not supported");
+
+    if (!token)
+      token = createWaitOp(op->getLoc(), tokenType, {});
+    asyncOp.addAsyncDependency(token);
+
+    // New result list: the async token first, then the original results.
+    SmallVector<Type, 1> resultTypes = {tokenType};
+    resultTypes.reserve(1 + op->getNumResults());
+    copy(op->getResultTypes(), std::back_inserter(resultTypes));
+
+    // Clone the asyncOp to add the token result.
+    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
+                                    op->getOperands(), op->getMutableAttrDict(),
+                                    op->getSuccessors());
+    auto results = newOp->getResults();
+    token = results.front();
+    rewriter.insert(newOp);
+    rewriter.replaceOp(op, results.drop_front());
+    return success();
+  }
+
+  // Creates a gpu.wait op; returns its async token (null when `resultType`
+  // is null, i.e. for the host-synchronizing form).
+  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
+    return rewriter.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
+  }
+
+  PatternRewriter &rewriter;
+  LogicalResult &matchResult; // Set to success() if any op has been rewritten.
+  const Type tokenType = rewriter.getType<gpu::AsyncTokenType>();
+  Value token; // Token of the most recent async op; null after host sync.
+};
+
+LogicalResult
+GpuAsyncRegionRewriter::matchAndRewrite(Operation *op,
+                                        PatternRewriter &rewriter) const {
+  if (op->getNumRegions() == 0)
+    return failure();
+
+  PatternRewriter::InsertionGuard guard(rewriter);
+  // Start from failure(); the callback flips it to success() only when it
+  // actually rewrote an op. Returning success() without modifying the IR
+  // would violate the rewrite pattern contract.
+  auto result = failure();
+  for (auto &region : op->getRegions()) {
+    if (region.walk(Callback{rewriter, result}).wasInterrupted())
+      return failure(); // Note: this is a persistent failure, but there is not
+                        // error reporting for rewrite patterns.
+  }
+  return result;
+}
+
+void GpuAsyncRegionPass::runOnFunction() {
+  OwningRewritePatternList patterns;
+  patterns.insert<GpuAsyncRegionRewriter>();
+  applyOpPatternsAndFold(getFunction(), patterns);
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
+  return std::make_unique<GpuAsyncRegionPass>();
+}
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt -gpu-async-region %s | FileCheck %s
+
+// CHECK: module attributes {gpu.container_module}
+module attributes {gpu.container_module} {
+
+  gpu.module @kernels {
+    gpu.func @kernel() kernel { gpu.return }
+  }
+
+  func @foo() -> ()
+
+  // CHECK-LABEL:func @async(%{{.*}}: index)
+  func @async(%sz : index) {
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.launch_func async [%[[t0]]]
+    gpu.launch_func @kernels::@kernel
+        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+    // CHECK: %[[t2:.*]] = gpu.launch_func async [%[[t1]]]
+    gpu.launch_func @kernels::@kernel
+        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+    // CHECK: gpu.wait [%[[t2]]]
+    // CHECK: call @foo
+    call @foo() : () -> ()
+    return
+  }
+
+}