diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -18,6 +18,7 @@
 #include "../PassDetail.h"
 #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
@@ -202,6 +203,18 @@
       ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertAsyncYieldToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
+public:
+  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 /// A rewrite pattern to convert gpu.wait operations into a GPU runtime
 /// call. Currently it supports CUDA and ROCm (HIP).
 class ConvertWaitOpToGpuRuntimeCallPattern
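For orientation, this is the shape of IR the new `ConvertAsyncYieldToGpuRuntimeCallPattern` targets: an `async.execute` region that yields a `!gpu.async.token`. A minimal sketch, assuming placeholder constants and a placeholder kernel name (neither is part of this patch); the `async.execute` and `gpu.launch_func` syntax follows the tests below:

```mlir
// After the GpuAsyncRegionPass, an async.execute region yields the
// !gpu.async.token of its async GPU op as an !async.value result.
%a0, %f0 = async.execute -> !async.value<!gpu.async.token> {
  %t = gpu.launch_func async @kernels::@kernel
      blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
  async.yield %t : !gpu.async.token
}
```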
@@ -429,11 +442,55 @@
   return success();
 }
 
-// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
-// streams (i.e. void*). The converted op synchronizes the host with every
-// stream and then destroys it. That is, it assumes that the stream is not used
-// afterwards. In case this isn't correct, we will get a runtime error.
-// Eventually, we will have a pass that guarantees this property.
+static bool isGpuAsyncTokenType(Value value) {
+  return value.getType().isa<gpu::AsyncTokenType>();
+}
+
+// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
+// !gpu.async.token operands are lowered to streams within the async.execute
+// region, but are passed between regions as events. For each !gpu.async.token
+// operand, we create an event and record it on the stream.
+LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
+    async::YieldOp yieldOp, ArrayRef<Value> operands,
+    ConversionPatternRewriter &rewriter) const {
+  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
+    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");
+
+  Location loc = yieldOp.getLoc();
+  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
+  llvm::SmallDenseSet<Value> streams;
+  for (auto &operand : yieldOp->getOpOperands()) {
+    if (!isGpuAsyncTokenType(operand.get()))
+      continue;
+    auto idx = operand.getOperandNumber();
+    auto stream = operands[idx];
+    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
+    newOperands[idx] = event;
+    streams.insert(stream);
+  }
+  for (auto stream : streams)
+    streamDestroyCallBuilder.create(loc, rewriter, {stream});
+
+  rewriter.updateRootInPlace(yieldOp,
+                             [&] { yieldOp->setOperands(newOperands); });
+  return success();
+}
+
+// Returns whether `value`, which was converted from a !gpu.async.token, is a
+// stream (instead of an event). We use streams within a region, and events
+// for block arguments and to pass across async.execute ops.
+static bool isTypeStream(Value value) {
+  assert(value.getType().isa<LLVM::LLVMPointerType>());
+  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
+    return defOp.callee()->equals("mgpuStreamCreate");
+  return false;
+}
+
+// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
+// with the stream/event operands. The operands are destroyed. That is, it
+// assumes that they are not used afterwards or elsewhere. Otherwise we will
+// get a runtime error. Eventually, we should guarantee this property.
 LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::WaitOp waitOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
@@ -442,21 +499,25 @@
 
   Location loc = waitOp.getLoc();
 
-  for (auto asyncDependency : operands)
-    streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
-  for (auto asyncDependency : operands)
-    streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
+  for (auto operand : operands) {
+    if (isTypeStream(operand)) {
+      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
+      streamDestroyCallBuilder.create(loc, rewriter, {operand});
+    } else {
+      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
+      eventDestroyCallBuilder.create(loc, rewriter, {operand});
+    }
+  }
 
   rewriter.eraseOp(waitOp);
   return success();
 }
 
-// Converts `gpu.wait async` to runtime calls. The result is a new stream that
-// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
-// void*). We create and record an event after the definition of the stream
-// and make the new stream wait on that event before destroying it again. This
-// assumes that there is no other use between the definition and this op, and
-// the plan is to have a pass that guarantees this property.
+// Converts `gpu.wait async` to runtime calls. The converted op creates a new
+// stream that is synchronized with the stream/event operands. The operands
+// are destroyed. That is, it assumes that they are not used afterwards or
+// elsewhere. Otherwise we will get a runtime error. Eventually, we should
+// guarantee this property.
 LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::WaitOp waitOp, ArrayRef<Value> operands,
     ConversionPatternRewriter &rewriter) const {
@@ -468,18 +529,20 @@
 
   auto insertionPoint = rewriter.saveInsertionPoint();
   SmallVector<Value, 4> events;
   for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
-    auto token = std::get<0>(pair);
-    if (auto *defOp = token.getDefiningOp()) {
+    auto operand = std::get<1>(pair);
+    if (isTypeStream(operand)) {
+      // The operand is a stream. Insert an event into the stream just after
+      // its last use (i.e. after the token-defining op, which always exists).
+      auto *defOp = std::get<0>(pair).getDefiningOp();
       rewriter.setInsertionPointAfter(defOp);
+      auto event =
+          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
+      events.push_back(event);
     } else {
-      // If we can't find the defining op, we record the event at block start,
-      // which is late and therefore misses parallelism, but still valid.
-      rewriter.setInsertionPointToStart(waitOp->getBlock());
+      // The operand is an event.
+      events.push_back(operand);
     }
-    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
-    auto stream = std::get<1>(pair);
-    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
-    events.push_back(event);
   }
   rewriter.restoreInsertionPoint(insertionPoint);
   auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
@@ -729,7 +792,8 @@
                   ConvertHostRegisterOpToGpuRuntimeCallPattern,
                   ConvertMemcpyOpToGpuRuntimeCallPattern,
                   ConvertWaitAsyncOpToGpuRuntimeCallPattern,
-                  ConvertWaitOpToGpuRuntimeCallPattern>(converter);
+                  ConvertWaitOpToGpuRuntimeCallPattern,
+                  ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
   patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
       converter, gpuBinaryAnnotation);
   patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
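Schematically, after this change an `async.yield %t : !gpu.async.token` lowers so that the stream backing the token is recorded into a fresh event and destroyed, and the event replaces the token as the yielded value. A hedged sketch of the emitted calls (pointer types abbreviated as `!llvm.ptr<i8>`; the exact types come from the call builders above, and the SSA names are placeholders):

```mlir
// Sketch only: event creation/recording replaces the yielded stream.
%event = llvm.call @mgpuEventCreate() : () -> !llvm.ptr<i8>
llvm.call @mgpuEventRecord(%event, %stream) : (!llvm.ptr<i8>, !llvm.ptr<i8>) -> ()
llvm.call @mgpuStreamDestroy(%stream) : (!llvm.ptr<i8>) -> ()
async.yield %event : !llvm.ptr<i8>
```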
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -30,6 +30,7 @@
 class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
   struct ThreadTokenCallback;
   struct DeferWaitCallback;
+  struct SingleTokenUseCallback;
   void runOnFunction() override;
 };
 } // namespace
@@ -120,6 +121,44 @@
   Value currentToken = {};
 };
 
+/// Erases `executeOp` and returns a clone with additional `results`.
+async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
+                                   ValueRange results) {
+  // Add values to async.yield op.
+  Operation *yieldOp = executeOp.getBody()->getTerminator();
+  yieldOp->insertOperands(yieldOp->getNumOperands(), results);
+
+  // Construct new result type list with additional types.
+  SmallVector<Type, 4> resultTypes;
+  resultTypes.reserve(executeOp.getNumResults() + results.size());
+  transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
+            [](Type type) {
+              // Extract value type from !async.value.
+              if (auto valueType = type.dyn_cast<async::ValueType>())
+                return valueType.getValueType();
+              assert(type.isa<async::TokenType>() && "expected token type");
+              return type;
+            });
+  transform(results, std::back_inserter(resultTypes),
+            [](Value value) { return value.getType(); });
+
+  // Clone executeOp with the extra results.
+  OpBuilder builder(executeOp);
+  auto newOp = builder.create<async::ExecuteOp>(
+      executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
+      executeOp.dependencies(), executeOp.operands());
+  BlockAndValueMapping mapper;
+  newOp.getRegion().getBlocks().clear();
+  executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
+
+  // Replace executeOp with the cloned one.
+  executeOp.getOperation()->replaceAllUsesWith(
+      newOp.getResults().drop_back(results.size()));
+  executeOp.erase();
+
+  return newOp;
+}
+
 // Callback for `async.execute` ops which tries to push the contained
 // synchronous `gpu.wait` op to the dependencies of the `async.execute`.
 struct GpuAsyncRegionPass::DeferWaitCallback {
@@ -146,69 +185,30 @@
     for (size_t i = 0; i < worklist.size(); ++i) {
       auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
-      auto numDependencies = waitOp.asyncDependencies().size();
 
-      // Erase `gpu.wait` and return async dependencies from region instead.
-      auto &yieldOp = executeOp.getBody()->getOperations().back();
-      yieldOp.insertOperands(yieldOp.getNumOperands(),
-                             waitOp.asyncDependencies());
+      // Erase `gpu.wait` and return async dependencies from execute op instead.
+      SmallVector<Value, 4> dependencies =
+          llvm::to_vector<4>(waitOp.asyncDependencies());
       waitOp.erase();
-      auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);
+      executeOp = addExecuteResults(executeOp, dependencies);
 
       // Add the async dependency to each user of the `async.execute` token.
+      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
       for (Operation *user : executeOp.token().getUsers())
         addAsyncDependencyAfter(asyncTokens, user);
     }
   }
 
 private:
-  // Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
-  static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
-                                         unsigned count) {
-    auto numResults = executeOp.getNumResults() + count;
-
-    // Construct new result type list with `count` additional types.
-    SmallVector<Type, 4> resultTypes;
-    resultTypes.reserve(numResults);
-    transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
-              [](Type type) {
-                // Extract value type from !async.value.
-                if (auto valueType = type.dyn_cast<async::ValueType>())
-                  return valueType.getValueType();
-                assert(type.isa<async::TokenType>() && "expected token type");
-                return type;
-              });
-    OpBuilder builder(executeOp);
-    auto tokenType = builder.getType<gpu::AsyncTokenType>();
-    resultTypes.resize(numResults, tokenType);
-
-    // Clone executeOp with the extra `!gpu.async.token` results.
-    auto newOp = builder.create<async::ExecuteOp>(
-        executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
-        executeOp.dependencies(), executeOp.operands());
-    BlockAndValueMapping mapper;
-    newOp.getRegion().getBlocks().clear();
-    executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
-
-    // Replace executeOp with cloned one.
-    executeOp.getOperation()->replaceAllUsesWith(
-        newOp.getResults().drop_back(count));
-    executeOp.erase();
-    executeOp = newOp;
-
-    // Return the new result values.
-    return executeOp.getResults().take_back(count);
-  }
-
   // Returns whether all token users are either 'async.execute' or 'async.await'
   // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
   // 'async.execute' body to it's users. Specifically, we do not allow
   // terminator users, because it could mean that the `async.execute` is inside
   // control flow code.
   static bool areAllUsersExecuteOrAwait(Value token) {
-    return llvm::all_of(token.getUsers(), [](Operation *user) {
-      return isa<async::ExecuteOp, async::AwaitOp>(user);
-    });
+    return !token.use_empty() &&
+           llvm::all_of(token.getUsers(), [](Operation *user) {
+             return isa<async::ExecuteOp, async::AwaitOp>(user);
+           });
   }
 
   // Add the `asyncToken` as dependency as needed after `op`.
@@ -268,6 +268,46 @@
   SmallVector<gpu::WaitOp, 8> worklist;
 };
 
+// Callback for `async.execute` ops which repeats !gpu.async.token results
+// so that each of them is only used once.
+struct GpuAsyncRegionPass::SingleTokenUseCallback {
+  void operator()(async::ExecuteOp executeOp) {
+    // Collect the !gpu.async.token results which have multiple uses.
+    auto multiUseResults =
+        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
+          if (result.use_empty() || result.hasOneUse())
+            return false;
+          auto valueType = result.getType().dyn_cast<async::ValueType>();
+          return valueType &&
+                 valueType.getValueType().isa<gpu::AsyncTokenType>();
+        });
+    if (multiUseResults.empty())
+      return;
+
+    // Indices within the async.execute results (i.e. excluding the token).
+    SmallVector<int, 4> indices;
+    transform(multiUseResults, std::back_inserter(indices),
+              [](OpResult result) {
+                return result.getResultNumber() - 1; // Index without token.
+              });
+
+    for (auto index : indices) {
+      assert(!executeOp.results()[index].getUses().empty());
+      // Repeat the async.yield token result, once for each use after the
+      // first one.
+      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
+      auto count = std::distance(uses.begin(), uses.end());
+      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
+      SmallVector<Value, 4> results(count, yieldOp.getOperand(index));
+      executeOp = addExecuteResults(executeOp, results);
+      // Update the uses and results after replacing executeOp.
+      uses = llvm::drop_begin(executeOp.results()[index].getUses());
+      auto newResults = executeOp.results().take_back(count);
+      for (auto pair : llvm::zip(uses, newResults))
+        std::get<0>(pair).set(std::get<1>(pair));
+    }
+  }
+};
+
 // Replaces synchronous GPU ops in the op's region with asynchronous ones and
 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
 // execution semantics and that no GPU ops are asynchronous yet.
@@ -280,6 +320,8 @@
 
   // Collect gpu.wait ops that we can move out of async.execute regions.
   getFunction().getRegion().walk(DeferWaitCallback());
+  // Make each !gpu.async.token returned from async.execute have a single use.
+  getFunction().getRegion().walk(SingleTokenUseCallback());
 }
 
 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
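The effect of `SingleTokenUseCallback` is easiest to see on IR. A schematic sketch, mirroring the `async_execute_fork` test added below (the consumer region bodies are elided with `...`, which is not valid MLIR as written):

```mlir
// The single token %t had two consumers; it is now yielded once per use,
// so the lowering can later record one event per outgoing edge.
%a0, %f0:2 = async.execute
    -> (!async.value<!gpu.async.token>, !async.value<!gpu.async.token>) {
  %t = gpu.launch_func async @kernels::@kernel
      blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
  async.yield %t, %t : !gpu.async.token, !gpu.async.token
}
%a1 = async.execute [%a0] (%f0#0 as %t0: !async.value<!gpu.async.token>) { ... }
%a2 = async.execute [%a0] (%f0#1 as %t1: !async.value<!gpu.async.token>) { ... }
```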
diff --git a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
@@ -12,8 +12,8 @@
     // CHECK: llvm.call @mgpuEventDestroy(%[[e0]])
     %t1 = gpu.wait async [%t0]
     // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
-    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
     // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
     // CHECK: llvm.call @mgpuStreamDestroy(%[[t1]])
     gpu.wait [%t0, %t1]
     return
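The reordered CHECK lines reflect that the new `gpu.wait` lowering emits a synchronize/destroy pair per operand, rather than one loop of synchronizes followed by one loop of destroys. Roughly, for the two stream operands in this test (a sketch with pointer types abbreviated):

```mlir
// gpu.wait [%t0, %t1] now lowers to per-operand pairs:
llvm.call @mgpuStreamSynchronize(%t0) : (!llvm.ptr<i8>) -> ()
llvm.call @mgpuStreamDestroy(%t0) : (!llvm.ptr<i8>) -> ()
llvm.call @mgpuStreamSynchronize(%t1) : (!llvm.ptr<i8>) -> ()
llvm.call @mgpuStreamDestroy(%t1) : (!llvm.ptr<i8>) -> ()
```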
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -125,4 +125,48 @@
     // CHECK: return %[[x]] : index
     return %x : index
   }
+
+  // CHECK-LABEL:func @async_execute_no_use(%{{.*}}: index)
+  func @async_execute_no_use(%sz : index) {
+    // CHECK: async.execute {
+    %a0 = async.execute {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    return
+  }
+
+  // CHECK-LABEL:func @async_execute_fork(%{{.*}}: index)
+  func @async_execute_fork(%sz : index) {
+    // CHECK: %[[a0:.*]], %[[f0:.*]]:2 = async.execute
+    // CHECK-SAME: -> (!async.value<!gpu.async.token>, !async.value<!gpu.async.token>)
+    %a0 = async.execute {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK-NOT: gpu.wait
+      // CHECK: async.yield %[[t]], %[[t]] : !gpu.async.token, !gpu.async.token
+      async.yield
+    }
+    // CHECK: async.execute [%[[a0]]] (%[[f0]]#0 as {{.*}}: !async.value<!gpu.async.token>)
+    %a1 = async.execute [%a0] {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    // CHECK: async.execute [%[[a0]]] (%[[f0]]#1 as {{.*}}: !async.value<!gpu.async.token>)
+    %a2 = async.execute [%a0] {
+      // CHECK: %[[t:.*]] = gpu.launch_func async
+      gpu.launch_func @kernels::@kernel
+          blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+      // CHECK: gpu.wait [%[[t]]]
+      async.yield
+    }
+    return
+  }
 }
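For completeness, a hedged sketch of the other new code path in `gpu.wait async`: when an operand was lowered to an event rather than a stream, the event is reused directly, and the newly created stream waits on it before the event is destroyed. This assumes the pattern binds a wrapper along the lines of `mgpuStreamWaitEvent` from MLIR's runtime wrappers, which is not shown in this patch; the SSA names are placeholders:

```mlir
// %t1 = gpu.wait async [%t0], where %t0 was lowered to the event %event:
%stream = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr<i8>
llvm.call @mgpuStreamWaitEvent(%stream, %event) : (!llvm.ptr<i8>, !llvm.ptr<i8>) -> ()
llvm.call @mgpuEventDestroy(%event) : (!llvm.ptr<i8>) -> ()
```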