diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -392,7 +392,7 @@ LLVM::LLVMType getVoidType() const; /// Get the MLIR type wrapping the LLVM i8* type. - LLVM::LLVMType getVoidPtrType() const; + LLVM::LLVMType getVoidPtrType(unsigned addrspace = 0) const; /// Create an LLVM dialect operation defining the given index constant. Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -18,7 +18,9 @@ #include "mlir/IR/FunctionSupport.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/IR/Types.h" #include "mlir/Interfaces/SideEffects.h" namespace mlir { @@ -34,6 +36,24 @@ Value z; }; +enum GpuTypes { + Chain = Type::FIRST_GPU_TYPE, + LAST_USED_GPU_TYPE = Chain, +}; + +class ChainType : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; + /// Construction hook. + static ChainType get(MLIRContext *context) { + /// Custom, uniq'ed construction in the MLIRContext. + return Base::get(context, GpuTypes::Chain); + } + /// Used to implement llvm-style cast. + static bool kindof(unsigned kind) { return kind == GpuTypes::Chain; } +}; + #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc" #define GET_OP_CLASSES diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -23,6 +23,10 @@ Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>, "integer, index or LLVM dialect equivalent">; +def GPU_ChainType : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::ChainType>()">, "chain">, + BuildableType<"mlir::gpu::ChainType::get($_builder.getContext())">; + //===----------------------------------------------------------------------===// // GPU Dialect operations. //===----------------------------------------------------------------------===// @@ -245,13 +249,13 @@ let parser = [{ return parseGPUFuncOp(parser, result); }]; } -def GPU_LaunchFuncOp : GPU_Op<"launch_func">, - Arguments<(ins IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY, +def GPU_LaunchFuncOp : GPU_Op<"launch_func", [AttrSizedOperandSegments]>, + Arguments<(ins Variadic:$chains, IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY, IntLikeOrLLVMInt:$gridSizeZ, IntLikeOrLLVMInt:$blockSizeX, IntLikeOrLLVMInt:$blockSizeY, IntLikeOrLLVMInt:$blockSizeZ, Variadic:$operands)>, - Results<(outs)> { - let summary = "Launches a function as a GPU kerneel"; + Results<(outs GPU_ChainType)> { + let summary = "Launches a function as a GPU kernel"; let description = [{ Launch a kernel function on the specified grid of thread blocks. 
@@ -323,11 +327,11 @@ let skipDefaultBuilders = 1; let builders = [ - OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, " "Value gridSizeX, Value gridSizeY, Value gridSizeZ, " "Value blockSizeX, Value blockSizeY, Value blockSizeZ, " "ValueRange kernelOperands">, - OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, " "KernelDim3 gridSize, KernelDim3 blockSize, " "ValueRange kernelOperands"> ]; @@ -352,6 +356,8 @@ /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); + OperandRange getChains(); + /// The number of launch configuration operands, placed at the leading /// positions of the operand list. static constexpr unsigned kNumConfigOperands = 6; @@ -373,9 +379,11 @@ } def GPU_LaunchOp : GPU_Op<"launch">, - Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, + Arguments<(ins Variadic:$chains, + Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>, - Results<(outs)> { + // Variadic (0 or 1) to be backwards compatible. + Results<(outs Variadic)> { let summary = "GPU kernel launch operation"; let description = [{ @@ -446,7 +454,10 @@ let builders = [ OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX," "Value gridSizeY, Value gridSizeZ, Value blockSizeX," - "Value blockSizeY, Value blockSizeZ"> + "Value blockSizeY, Value blockSizeZ">, + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, " + "Value gridSizeX, Value gridSizeY, Value gridSizeZ, " + "Value blockSizeX, Value blockSizeY, Value blockSizeZ"> ]; let extraClassDeclaration = [{ @@ -464,6 +475,8 @@ /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); + OperandRange getChains(); + static StringRef getBlocksKeyword() { return "blocks"; } static StringRef getThreadsKeyword() { return "threads"; } @@ -641,6 +654,46 @@ let printer = [{ p << getOperationName(); }]; } +def GPU_MemcpyOp : GPU_Op<"memcpy"> { + let summary = "GPU memory copy operation"; + let description = [{ + The `memcpy` operation copies a region of memory from GPU to host or vice + versa. + }]; + + // TODO(csigg): src and dst should be AnyUnrankedMemRef. + let arguments = (ins Variadic:$chains, AnyType:$dst, + AnyType:$src, OptionalAttr:$element_size); + let results = (outs GPU_ChainType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, ValueRange chains, Value dst, " # + "Value src", [{ + result.addOperands(chains); + result.addOperands({dst, src}); + result.types.push_back(ChainType::get(builder->getContext())); + }]>]; + + let extraClassDeclaration = [{ + }]; + + let assemblyFormat = [{ + (`[` $chains^ `]`)? `(` $dst`,` $src `)` attr-dict `:` type($dst)`,` type($src) + }]; +} + +def GPU_WaitOp : GPU_Op<"wait"> { + let summary = "GPU async wait operation"; + let description = [{ + The `wait` operation blocks on a list of chains. 
+ }]; + + let arguments = (ins Variadic:$chains); + let results = (outs); + + let assemblyFormat = "`[` $chains `]` attr-dict"; +} + def GPU_GPUModuleOp : GPU_Op<"module", [ IsolatedFromAbove, SymbolTable, Symbol, SingleBlockImplicitTerminator<"ModuleEndOp"> diff --git a/mlir/include/mlir/IR/DialectSymbolRegistry.def b/mlir/include/mlir/IR/DialectSymbolRegistry.def --- a/mlir/include/mlir/IR/DialectSymbolRegistry.def +++ b/mlir/include/mlir/IR/DialectSymbolRegistry.def @@ -25,6 +25,7 @@ DEFINE_SYM_KIND_RANGE(SPIRV) // SPIR-V dialect DEFINE_SYM_KIND_RANGE(XLA_HLO) // XLA HLO dialect DEFINE_SYM_KIND_RANGE(SHAPE) // Shape dialect +DEFINE_SYM_KIND_RANGE(GPU) // GPU dialect // The following ranges are reserved for experimenting with MLIR dialects in a // private context without having to register them here. diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Mutex.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" @@ -57,6 +58,11 @@ void runOnOperation() override { gpu::GPUModuleOp module = getOperation(); + auto llvmDialect = + module.getContext()->getRegisteredDialect(); + llvm::sys::SmartScopedLock scopedLock( + llvmDialect->getLLVMContextMutex()); + // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetInfo(); diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -41,6 +41,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister"; +static constexpr const char *cuGraphAddKernelNodeName = "mcuGraphAddKernelNode"; +static constexpr const char *cuGraphAddMemcpyNodeName = "mcuGraphAddMemcpyNode"; +static constexpr const char *cuGraphExecuteName = "mcuGraphExecute"; +static constexpr const char *cuGetGraphHelperName = "mcuGetGraphHelper"; static constexpr const char *kCubinAnnotation = "nvvm.cubin"; static constexpr const char *kCubinStorageSuffix = "_cubin_cst"; @@ -79,6 +83,17 @@ llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); llvmIntPtrType = LLVM::LLVMType::getIntNTy( llvmDialect, module.getDataLayout().getPointerSizeInBits()); + llvmKernelNodeParamsType = LLVM::LLVMType::getStructTy( + llvmDialect, + { + getPointerType(), // CUfunction + getInt32Type(), getInt32Type(), getInt32Type(), // gridDim + getInt32Type(), getInt32Type(), getInt32Type(), // blockDim + getInt32Type(), // sharedMemBytes + getPointerPointerType(), // kernelParams + getPointerPointerType() // extras + }, + false); } LLVM::LLVMType getVoidType() { return llvmVoidType; } @@ -104,6 +119,8 @@ return getInt32Type(); } + LLVM::LLVMType getKernelNodeParamsType() { return llvmKernelNodeParamsType; } + // Allocate a void pointer on the stack. 
Value allocatePointer(OpBuilder &builder, Location loc) { auto one = builder.create(loc, getInt32Type(), @@ -119,6 +136,8 @@ Value generateKernelNameConstant(StringRef moduleName, StringRef name, Location loc, OpBuilder &builder); void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); + void translateGpuMemcpyOp(mlir::gpu::MemcpyOp memcpyOp); + void translateGpuWaitOp(mlir::gpu::WaitOp waitOp); public: // Run the dialect converter on the module. @@ -131,6 +150,11 @@ getOperation().walk( [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); + getOperation().walk( + [this](mlir::gpu::MemcpyOp op) { translateGpuMemcpyOp(op); }); + getOperation().walk( + [this](mlir::gpu::WaitOp op) { translateGpuWaitOp(op); }); + // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. for (auto m : @@ -147,6 +171,7 @@ LLVM::LLVMType llvmInt32Type; LLVM::LLVMType llvmInt64Type; LLVM::LLVMType llvmIntPtrType; + LLVM::LLVMType llvmKernelNodeParamsType; }; } // anonymous namespace @@ -230,6 +255,58 @@ }, /*isVarArg=*/false)); } + if (!module.lookupSymbol(cuGraphAddKernelNodeName)) { + builder.create( + loc, cuGraphAddKernelNodeName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerPointerType(), /* CUgraphNode* graph_node */ + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + getKernelNodeParamsType() + .getPointerTo(), /* CUDA_KERNEL_NODE_PARAMS* params */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGraphAddMemcpyNodeName)) { + auto memrefTy = LLVM::LLVMType::getStructTy( + llvmDialect, {getInt64Type(), getPointerType()}); + builder.create( + loc, cuGraphAddMemcpyNodeName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerPointerType(), /* CUgraphNode* graph_node */ + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + memrefTy, /* dst */ + memrefTy, /* src */ + getInt64Type(), /* size_t element_size */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGraphExecuteName)) { + builder.create( + loc, cuGraphExecuteName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGetGraphHelperName)) { + // Helper function to get the current CUDA stream. Uses void* instead of + // CUDAs opaque CUstream. + builder.create( + loc, cuGetGraphHelperName, + LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); + } } /// Emits the IR with the following structure: @@ -426,14 +503,14 @@ builder.getSymbolRefAttr(cuModuleGetFunction), ArrayRef{cuFunction, cuOwningModuleRef, kernelName}); // Grab the global stream needed for execution. - auto cuGetStreamHelper = - getOperation().lookupSymbol(cuGetStreamHelperName); - auto cuStream = builder.create( + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef{}); + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); // Invoke the function with required arguments. 
- auto cuLaunchKernel = - getOperation().lookupSymbol(cuLaunchKernelName); + auto cuGraphAddKernelNode = + getOperation().lookupSymbol(cuGraphAddKernelNodeName); auto cuFunctionRef = builder.create(loc, getPointerType(), cuFunction); auto paramsArray = setupParamsArray(launchOp, builder); @@ -441,27 +518,142 @@ launchOp.emitOpError() << "cannot pass given parameters to the kernel"; return signalPassFailure(); } + + auto one = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(1)); + auto kernelNodeParams = builder.create( + loc, getKernelNodeParamsType().getPointerTo(), one, /*alignement=*/0); + auto setParamsValue = [&](Value value, size_t i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto ptrTy = value.getType().cast().getPointerTo(); + auto gep = builder.create(loc, ptrTy, kernelNodeParams, + ArrayRef{zero, index}); + builder.create(loc, value, gep); + }; + + setParamsValue(cuFunctionRef, 0); + for (const auto &en : + llvm::enumerate(launchOp.getOperands() + .drop_front(launchOp.getChains().size()) + .take_front(launchOp.kNumConfigOperands))) { + auto truncOp = + builder.create(loc, getInt32Type(), en.value()); + setParamsValue(truncOp, 1 + en.index()); + } + setParamsValue(/*sharedMemSizeBytes=*/zero, 7); + setParamsValue(paramsArray, 8); auto nullpointer = builder.create(loc, getPointerPointerType(), zero); + setParamsValue(nullpointer, 9); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(launchOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < launchOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, launchOp.chains()[i], gep); + } + + auto nodePointer = allocatePointer(builder, loc); + builder.create( loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuLaunchKernel), - ArrayRef{cuFunctionRef, launchOp.getOperand(0), - launchOp.getOperand(1), launchOp.getOperand(2), - launchOp.getOperand(3), launchOp.getOperand(4), - launchOp.getOperand(5), zero, /* sharedMemBytes */ - cuStream.getResult(0), /* stream */ - paramsArray, /* kernel params */ - nullpointer /* extra */}); - // Sync on the stream to make it synchronous. 
- auto cuStreamSync = - getOperation().lookupSymbol(cuStreamSynchronizeName); - builder.create(loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuStreamSync), - ArrayRef(cuStream.getResult(0))); + builder.getSymbolRefAttr(cuGraphAddKernelNode), + ArrayRef{nodePointer, cuGraph.getResult(0), chainsArray, + chainsSize, kernelNodeParams}); + + Value nodeRef = + builder.create(loc, getPointerType(), nodePointer); + + launchOp.replaceAllUsesWith(nodeRef); launchOp.erase(); } +void GpuLaunchFuncToCudaCallsPass::translateGpuMemcpyOp( + mlir::gpu::MemcpyOp memcpyOp) { + OpBuilder builder(memcpyOp); + Location loc = memcpyOp.getLoc(); + declareCudaFunctions(loc); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(memcpyOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < memcpyOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, memcpyOp.chains()[i], gep); + } + + auto nodePointer = allocatePointer(builder, loc); + + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( + loc, ArrayRef{getPointerType()}, + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); + + auto elementSize = builder.create( + loc, getInt64Type(), + builder.getI64IntegerAttr(memcpyOp.element_size()->getLimitedValue())); + + auto cuGraphAddMemcpyNode = + getOperation().lookupSymbol(cuGraphAddMemcpyNodeName); + + builder.create( + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuGraphAddMemcpyNode), + ArrayRef{nodePointer, cuGraph.getResult(0), chainsArray, + chainsSize, memcpyOp.dst(), memcpyOp.src(), elementSize}); + + Value nodeRef = + builder.create(loc, getPointerType(), nodePointer); + + memcpyOp.replaceAllUsesWith(nodeRef); + memcpyOp.erase(); +} + +void GpuLaunchFuncToCudaCallsPass::translateGpuWaitOp( + mlir::gpu::WaitOp waitOp) { + OpBuilder builder(waitOp); + Location loc = waitOp.getLoc(); + declareCudaFunctions(loc); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(waitOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < waitOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, waitOp.chains()[i], gep); + } + + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( + loc, ArrayRef{getPointerType()}, + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); + + auto cuGraphExecute = + getOperation().lookupSymbol(cuGraphExecuteName); + + builder.create( + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuGraphExecute), + ArrayRef{cuGraph.getResult(0), chainsArray, chainsSize}); + waitOp.erase(); +} + std::unique_ptr> mlir::createConvertGpuLaunchFuncToCudaCallsPass() { return std::make_unique(); diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -725,8 +725,8 @@ return 
LLVM::LLVMType::getVoidTy(&getDialect()); } -LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType() const { - return LLVM::LLVMType::getInt8PtrTy(&getDialect()); +LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType(unsigned addrspace) const { + return LLVM::LLVMType::getInt8Ty(&getDialect()).getPointerTo(addrspace); } Value ConvertToLLVMPattern::createIndexConstant( @@ -1524,6 +1524,8 @@ // Insert the malloc/aligned_alloc declaration if it is not already present. auto allocFuncName = useAlignedAlloc ? "aligned_alloc" : "malloc"; + if (memRefType.getMemorySpace() == 1) + allocFuncName = "mcuMalloc"; auto module = allocOp.getParentOfType(); auto allocFunc = module.lookupSymbol(allocFuncName); if (!allocFunc) { @@ -1534,8 +1536,9 @@ callArgTypes.push_back(getIndexType()); allocFunc = moduleBuilder.create( rewriter.getUnknownLoc(), allocFuncName, - LLVM::LLVMType::getFunctionTy(getVoidPtrType(), callArgTypes, - /*isVarArg=*/false)); + LLVM::LLVMType::getFunctionTy( + getVoidPtrType(memRefType.getMemorySpace()), callArgTypes, + /*isVarArg=*/false)); } // Allocate the underlying buffer and store a pointer to it in the MemRef @@ -1570,8 +1573,9 @@ } auto allocFuncSymbol = rewriter.getSymbolRefAttr(allocFunc); allocatedBytePtr = rewriter - .create(loc, getVoidPtrType(), - allocFuncSymbol, callArgs) + .create( + loc, getVoidPtrType(memRefType.getMemorySpace()), + allocFuncSymbol, callArgs) .getResult(0); // For heap allocations, the allocated pointer is a cast of the byte pointer // to the type pointer. diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -14,12 +14,14 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Function.h" #include "mlir/IR/FunctionImplementation.h" #include "mlir/IR/Module.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" using namespace mlir; using namespace mlir::gpu; @@ -35,12 +37,38 @@ GPUDialect::GPUDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) { + addTypes(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/GPUOps.cpp.inc" >(); } +Type GPUDialect::parseType(DialectAsmParser &parser) const { + // Parse the main keyword for the type. + StringRef keyword; + if (parser.parseKeyword(&keyword)) + return Type(); + MLIRContext *context = getContext(); + + // Handle 'chain' types. 
+ if (keyword == "chain") + return ChainType::get(context); + + parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); + return Type(); +} + +void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { + switch (type.getKind()) { + default: + llvm_unreachable("Unhandled gpu type"); + case GpuTypes::Chain: + os << "chain"; + break; + } +} + LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, NamedAttribute attr) { if (!attr.second.isa() || @@ -201,9 +229,11 @@ // LaunchOp //===----------------------------------------------------------------------===// -void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX, - Value gridSizeY, Value gridSizeZ, Value blockSizeX, - Value blockSizeY, Value blockSizeZ) { +void LaunchOp::build(Builder *builder, OperationState &result, + ValueRange chains, Value gridSizeX, Value gridSizeY, + Value gridSizeZ, Value blockSizeX, Value blockSizeY, + Value blockSizeZ) { + result.addOperands(chains); // Add grid and block sizes as op operands, followed by the data operands. result.addOperands( {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ}); @@ -217,6 +247,12 @@ std::vector(kNumConfigRegionAttributes, builder->getIndexType())); kernelRegion->push_back(body); } +void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX, + Value gridSizeY, Value gridSizeZ, Value blockSizeX, + Value blockSizeY, Value blockSizeZ) { + build(builder, result, {}, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, + blockSizeY, blockSizeZ); +} KernelDim3 LaunchOp::getBlockIds() { assert(!body().getBlocks().empty() && "FuncOp body must not be empty."); @@ -243,11 +279,20 @@ } KernelDim3 LaunchOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; +} + +OperandRange LaunchOp::getChains() { + auto operands = getOperands(); + return {operands.begin(), llvm::find_if_not(operands, [](const Value &op) { + return op.getType().isa(); + })}; } static LogicalResult verify(LaunchOp op) { @@ -256,8 +301,9 @@ // for block/thread identifiers and grid/block sizes. if (!op.body().empty()) { Block &entryBlock = op.body().front(); - if (entryBlock.getNumArguments() != - LaunchOp::kNumConfigOperands + op.getNumOperands()) + if (entryBlock.getNumArguments() - LaunchOp::kNumConfigRegionAttributes != + op.getNumOperands() - op.getChains().size() - + LaunchOp::kNumConfigOperands) return op.emitOpError("unexpected number of region arguments"); } @@ -286,22 +332,27 @@ // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use) // where %size-* and %iter-* will correspond to the body region arguments. 
static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, - ValueRange operands, KernelDim3 ids) { + KernelDim3 operands, KernelDim3 ids) { p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in ("; - p << size.x << " = " << operands[0] << ", "; - p << size.y << " = " << operands[1] << ", "; - p << size.z << " = " << operands[2] << ')'; + p << size.x << " = " << operands.x << ", "; + p << size.y << " = " << operands.y << ", "; + p << size.z << " = " << operands.z << ')'; } static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) { - ValueRange operands = op.getOperands(); - // Print the launch configuration. - p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword(); - printSizeAssignment(p, op.getGridSize(), operands.take_front(3), + p << LaunchOp::getOperationName(); + // Note: only print [] if op returns a chain for backwards compatibility. + if (!op.getResults().empty()) { + p << '['; + p.printOperands(op.getChains()); + p << ']'; + } + p << ' ' << op.getBlocksKeyword(); + printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(), op.getBlockIds()); p << ' ' << op.getThreadsKeyword(); - printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3), + printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(), op.getThreadIds()); p.printRegion(op.body(), /*printEntryBlockArgs=*/false); @@ -344,6 +395,17 @@ // region attr-dict? // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) { + // Note: only return chain if it contains '[]' for backwards compatibility. + if (!parser.parseOptionalLSquare()) { + SmallVector chains; + auto chainTy = ChainType::get(parser.getBuilder().getContext()); + if (parser.parseOperandList(chains) || + parser.resolveOperands(chains, chainTy, result.operands) || + parser.parseOptionalRSquare() || + parser.addTypeToList(chainTy, result.types)) + return failure(); + } + // Sizes of the grid and block. SmallVector sizes( LaunchOp::kNumConfigOperands); @@ -390,9 +452,12 @@ //===----------------------------------------------------------------------===// void LaunchFuncOp::build(Builder *builder, OperationState &result, - GPUFuncOp kernelFunc, Value gridSizeX, Value gridSizeY, - Value gridSizeZ, Value blockSizeX, Value blockSizeY, - Value blockSizeZ, ValueRange kernelOperands) { + ValueRange chains, GPUFuncOp kernelFunc, + Value gridSizeX, Value gridSizeY, Value gridSizeZ, + Value blockSizeX, Value blockSizeY, Value blockSizeZ, + ValueRange kernelOperands) { + result.types.push_back(ChainType::get(builder->getContext())); + result.addOperands(chains); // Add grid and block sizes as op operands, followed by the data operands. 
result.addOperands( {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ}); @@ -402,12 +467,18 @@ auto kernelModule = kernelFunc.getParentOfType(); result.addAttribute(getKernelModuleAttrName(), builder->getSymbolRefAttr(kernelModule.getName())); + SmallVector operandSegmentSizes(8, 1); + operandSegmentSizes.front() = static_cast(chains.size()); + operandSegmentSizes.back() = static_cast(kernelOperands.size()); + result.addAttribute(getOperandSegmentSizeAttr(), + builder->getI32VectorAttr(operandSegmentSizes)); } void LaunchFuncOp::build(Builder *builder, OperationState &result, - GPUFuncOp kernelFunc, KernelDim3 gridSize, - KernelDim3 blockSize, ValueRange kernelOperands) { - build(builder, result, kernelFunc, gridSize.x, gridSize.y, gridSize.z, + ValueRange chains, GPUFuncOp kernelFunc, + KernelDim3 gridSize, KernelDim3 blockSize, + ValueRange kernelOperands) { + build(builder, result, chains, kernelFunc, gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z, kernelOperands); } @@ -416,7 +487,8 @@ } unsigned LaunchFuncOp::getNumKernelOperands() { - return getNumOperands() - kNumConfigOperands; + // TODO(csigg): Use the attribute from AttrSizedOperandSegments? + return getNumOperands() - getChains().size() - kNumConfigOperands; } StringRef LaunchFuncOp::getKernelModuleName() { @@ -425,15 +497,24 @@ } Value LaunchFuncOp::getKernelOperand(unsigned i) { - return getOperation()->getOperand(i + kNumConfigOperands); + return getOperand(getChains().size() + kNumConfigOperands + i); } KernelDim3 LaunchFuncOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; +} + +OperandRange LaunchFuncOp::getChains() { + auto num_chains = + *getAttrOfType(getOperandSegmentSizeAttr()).begin(); + auto begin = getOperands().begin(); + return {begin, begin + num_chains.getLimitedValue()}; } static LogicalResult verify(LaunchFuncOp op) { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -198,9 +198,11 @@ gpu::GPUFuncOp kernelFunc, ValueRange operands) { OpBuilder builder(launchOp); - builder.create( - launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), - launchOp.getBlockSizeOperandValues(), operands); + auto launchFuncOp = builder.create( + launchOp.getLoc(), launchOp.getChains(), kernelFunc, + launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(), + operands); + launchOp.replaceAllUsesWith(launchFuncOp.getOperation()); launchOp.erase(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -497,8 +497,6 @@ /// definitions. LogicalResult ModuleTranslation::convertGlobals() { // Lock access to the llvm context. 
-  llvm::sys::SmartScopedLock<true> scopedLock(
-      llvmDialect->getLLVMContextMutex());
   for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {
     llvm::Type *type = op.getType().getUnderlyingType();
     llvm::Constant *cst = llvm::UndefValue::get(type);
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s --dump-input-on-failure
 
 module attributes {gpu.container_module} {
 
@@ -83,6 +83,18 @@
     return
   }
 
+  func @async() {
+    %size = constant 32 : index
+    // CHECK: %{{.*}} = gpu.alloc (%{{.*}}) : memref
+    %b0 = gpu.alloc (%size) : memref
+    // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[] (%{{.*}}) : memref
+    %b1, %t1 = gpu.alloc[] (%size) : memref
+    // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[%{{.*}}] (%{{.*}}) : memref
+    %b2, %t2 = gpu.alloc[%t1] (%size) : memref
+
+    return
+  }
+
   module @gpu_funcs attributes {gpu.kernel_module} {
     // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
     // CHECK: workgroup
diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/mlir-cuda-runner/async.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/async.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @main() {
+  %one = constant 1 : index
+  %count = constant 4 : index
+  // b0 = iota (init)
+  %b0 = alloc(%count) : memref<?xi32, 1>
+  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b0[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // b1 = iota (init)
+  %b1 = alloc(%count) : memref<?xi32, 1>
+  %t2 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b1[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // b2 = b0 + b1 (join)
+  %b2 = alloc(%count) : memref<?xi32, 1>
+  %t3 = gpu.launch[%t1, %t2] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %v0 = load %b0[%tx] : memref<?xi32, 1>
+    %v1 = load %b1[%tx] : memref<?xi32, 1>
+    %sum = addi %v0, %v1 : i32
+    store %sum, %b2[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // copy b2 to h0 and h1 (fork)
+  %h0 = alloc(%count) : memref<?xi32>
+  %h1 = alloc(%count) : memref<?xi32>
+  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+  %g1 = memref_cast %h1 : memref<?xi32> to memref<*xi32>
+  %c2 = memref_cast %b2 : memref<?xi32, 1> to memref<*xi32, 1>
+  %t4 = gpu.memcpy[%t3] (%g0, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  %t5 = gpu.memcpy[%t3] (%g1, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  // wait for copies to complete (sync)
+  gpu.wait[%t4, %t5]
+  // print result
+  // CHECK: [0, 2, 4, 6]
+  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
+  // CHECK: [0, 2, 4, 6]
+  call @print_memref_i32(%g1) : (memref<*xi32>) -> ()
+  return
+}
+
+func @print_memref_i32(memref<*xi32>)
diff --git a/mlir/test/mlir-cuda-runner/simple.mlir b/mlir/test/mlir-cuda-runner/simple.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/simple.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-cuda-runner %s
--shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @main() {
+  %one = constant 1 : index
+  %count = constant 4 : index
+  // b0 = iota (init)
+  %b0 = alloc(%count) : memref<?xi32, 1>
+  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b0[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // copy b0 to h0
+  %h0 = alloc(%count) : memref<?xi32>
+  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+  %c0 = memref_cast %b0 : memref<?xi32, 1> to memref<*xi32, 1>
+  %t2 = gpu.memcpy[%t1] (%g0, %c0) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  // wait for copy to complete (sync)
+  gpu.wait[%t2]
+  // print result
+  // CHECK: [0, 1, 2, 3]
+  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
+  return
+}
+
+func @print_memref_i32(memref<*xi32>)
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -21,7 +21,7 @@
 #include "cuda.h"
 
 namespace {
-int32_t reportErrorIfAny(CUresult result, const char *where) {
+CUresult reportErrorIfAny(CUresult result, const char *where) {
   if (result != CUDA_SUCCESS) {
     llvm::errs() << "CUDA failed with " << result << " in " << where << "\n";
   }
@@ -65,6 +65,72 @@
   return stream;
 }
 
+extern "C" CUgraph mcuGetGraphHelper() {
+  static auto result = [] {
+    CUgraph graph = nullptr;
+    reportErrorIfAny(cuGraphCreate(&graph, 0), "cuGraphCreate");
+    return graph;
+  }();
+  return result;
+}
+extern "C" CUresult mcuGraphExecute(CUgraph graph) {
+  CUgraphExec exec = nullptr;
+  reportErrorIfAny(cuGraphInstantiate(&exec, graph, /*phErrorNode=*/NULL,
+                                      /*logBuffer=*/NULL, /*bufferSize=*/0),
+                   "cuGraphInstantiate");
+  reportErrorIfAny(cuGraphLaunch(exec, nullptr), "cuGraphLaunch");
+  return reportErrorIfAny(cuGraphExecDestroy(exec), "cuGraphExecDestroy");
+}
+extern "C" CUresult mcuGraphAddKernelNode(CUgraphNode *node, CUgraph graph,
+                                          const CUgraphNode *deps,
+                                          int32_t num_deps,
+                                          CUDA_KERNEL_NODE_PARAMS *params) {
+  return reportErrorIfAny(
+      cuGraphAddKernelNode(node, graph, deps, num_deps, params),
+      "cuGraphAddKernelNode");
+}
+
+struct StridedMemRefDesc {
+  void *basePtr;
+  CUdeviceptr data;
+  int64_t offset;
+  int64_t sizes[4];
+};
+
+extern "C" CUresult
+mcuGraphAddMemcpyNode(CUgraphNode *node, CUgraph graph, const CUgraphNode *deps,
+                      int32_t num_deps, int64_t dst_rank,
+                      StridedMemRefDesc *dst_desc, int64_t src_rank,
+                      StridedMemRefDesc *src_desc, size_t element_size_bytes) {
+  CUcontext ctx;
+  reportErrorIfAny(cuCtxGetCurrent(&ctx), "cuCtxGetCurrent");
+  auto get_size = [](int64_t rank, StridedMemRefDesc *desc) {
+    const int64_t *begin = desc->sizes;
+    return std::accumulate(begin, begin + rank, 1, std::multiplies<int64_t>());
+  };
+  int64_t src_size = get_size(src_rank, src_desc);
+  assert(src_size == get_size(dst_rank, dst_desc));
+
+  CUDA_MEMCPY3D copy_params = {};
+  copy_params.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+  copy_params.srcDevice = src_desc->data;
+  copy_params.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+  copy_params.dstDevice = dst_desc->data;
+  copy_params.WidthInBytes = src_size * element_size_bytes;
+  copy_params.Height = copy_params.Depth = 1;
+
+  return reportErrorIfAny(
+      cuGraphAddMemcpyNode(node, graph, deps,
+                           num_deps, &copy_params, ctx),
+      "cuGraphAddMemcpyNode");
+}
+
+extern "C" void *mcuMalloc(size_t size) {
+  void *ptr;
+  reportErrorIfAny(cuMemAlloc(reinterpret_cast<CUdeviceptr *>(&ptr), size),
+                   "cuMemAlloc");
+  return ptr;
+}
+
 extern "C" int32_t mcuStreamSynchronize(void *stream) {
   return reportErrorIfAny(
       cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");
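
Note on the lowering model (illustration, not part of the patch): the chain values produced by gpu.launch_func and gpu.memcpy become CUgraphNode handles, and gpu.wait is what instantiates and launches the accumulated graph. The standalone C++ sketch below shows the underlying CUDA driver calls (cuGraphCreate, cuGraphAddKernelNode, cuGraphInstantiate, cuGraphLaunch) that the mcu* wrappers above forward to. The `function` and `kernelParams` arguments are placeholders standing in for what the lowering obtains via the existing cuModuleGetFunction call and setupParamsArray; the launch configuration mirrors the 1x1x1 grid / 4x1x1 block used in the tests.

#include "cuda.h"

// Minimal sketch of the chain -> CUDA graph mapping; error handling trimmed
// to early returns.
CUresult launchViaGraph(CUfunction function, void **kernelParams) {
  // mcuGetGraphHelper: a single graph is built up lazily.
  CUgraph graph;
  CUresult err = cuGraphCreate(&graph, /*flags=*/0);
  if (err != CUDA_SUCCESS)
    return err;

  // mcuGraphAddKernelNode: each gpu.launch_func adds one node; its chain
  // operands form the `dependencies` array, its chain result is `node`.
  CUDA_KERNEL_NODE_PARAMS params = {};
  params.func = function;
  params.gridDimX = params.gridDimY = params.gridDimZ = 1;
  params.blockDimX = 4;
  params.blockDimY = params.blockDimZ = 1;
  params.sharedMemBytes = 0;
  params.kernelParams = kernelParams;
  params.extra = nullptr;
  CUgraphNode node;
  err = cuGraphAddKernelNode(&node, graph, /*dependencies=*/nullptr,
                             /*numDependencies=*/0, &params);
  if (err != CUDA_SUCCESS)
    return err;

  // mcuGraphExecute (reached from gpu.wait): instantiate and launch the graph.
  CUgraphExec exec;
  err = cuGraphInstantiate(&exec, graph, /*phErrorNode=*/nullptr,
                           /*logBuffer=*/nullptr, /*bufferSize=*/0);
  if (err != CUDA_SUCCESS)
    return err;
  err = cuGraphLaunch(exec, /*hStream=*/nullptr);
  cuGraphExecDestroy(exec);
  cuGraphDestroy(graph);
  return err;
}

Because dependencies are expressed per node rather than through stream ordering, launches whose chain lists do not overlap (e.g. %t1 and %t2 in async.mlir) are free to run concurrently once the graph is launched.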