diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1052,6 +1052,38 @@
   let hasFolder = 1;
 }
 
+def GPU_MemzeroOp : GPU_Op<"memzero", [GPU_AsyncOpInterface]> {
+
+  let summary = "GPU memzero operation";
+
+  let description = [{
+    The `gpu.memzero` operation sets the content of the memref to zero.
+
+    The op does not execute before all async dependencies have finished
+    executing.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.memzero async [%dep] %dst : memref<?xf32, 1>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Arg<AnyMemRef, "", [MemWrite]>:$dst);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $dst `:` type($dst) attr-dict
+  }];
+  let hasFolder = 1;
+}
+
 def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device",
                                     [MemoryEffects<[MemWrite]>]>,
     Arguments<(ins I32:$devIndex)> {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1268,6 +1268,11 @@
   return foldMemRefCast(*this);
 }
 
+LogicalResult MemzeroOp::fold(ArrayRef<Attribute> operands,
+                              SmallVectorImpl<::mlir::OpFoldResult> &results) {
+  return foldMemRefCast(*this);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_WaitOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -111,6 +111,15 @@
   return
 }
 
+// CHECK-LABEL: @memzero_after_cast
+func.func @memzero_after_cast(%arg0: memref<10xf32>) {
+  // CHECK-NOT: memref.cast
+  // CHECK: gpu.memzero
+  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
+  gpu.memzero %0 : memref<?xf32>
+  return
+}
+
 // -----
 
 // Test case: Folding of memref.dim(gpu.alloc(%size), %idx) -> %size
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -257,6 +257,17 @@
     return
   }
 
+  func.func @memzero(%dst : memref<3x7xf32>) {
+    // CHECK-LABEL: func @memzero
+    // CHECK: gpu.memzero {{.*}} : memref<3x7xf32>
+    gpu.memzero %dst : memref<3x7xf32>
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: {{.*}} = gpu.memzero async [%[[t0]]] {{.*}} : memref<3x7xf32>
+    %1 = gpu.memzero async [%0] %dst : memref<3x7xf32>
+    return
+  }
+
   func.func @mmamatrix_valid_element_type(%src : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>>){
     // CHECK-LABEL: func @mmamatrix_valid_element_type
    %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
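
Not part of the patch — a minimal usage sketch, assuming `gpu.memzero` lands with the assembly format defined above. It shows the op in the usual async token chain between `gpu.alloc` and `gpu.dealloc`; the function name and buffer shape are illustrative only:

```mlir
// Hypothetical example (not from the patch): zero-initialize a freshly
// allocated device buffer, chaining alloc, memzero, and dealloc on
// !gpu.async.token values so none of them blocks the host.
func.func @zeroed_device_buffer() {
  %t0 = gpu.wait async
  %memref, %t1 = gpu.alloc async [%t0] () : memref<256xf32>
  %t2 = gpu.memzero async [%t1] %memref : memref<256xf32>
  %t3 = gpu.dealloc async [%t2] %memref : memref<256xf32>
  // Block the host until the whole chain has finished on the device.
  gpu.wait [%t3]
  return
}
```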