diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -858,6 +858,8 @@
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
+  let hasCanonicalizer = 1;
+
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
   }];
@@ -970,6 +972,7 @@
   }];
   let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 def GPU_MemsetOp : GPU_Op<"memset",
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1064,6 +1064,45 @@
   printer << "]";
 }
 
+namespace {
+
+/// Erases a common case of copy ops where a destination value is used only by
+/// the copy op, alloc and dealloc ops.
+struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
+  using OpRewritePattern<MemcpyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(MemcpyOp op,
+                                PatternRewriter &rewriter) const override {
+    Value dest = op.dst();
+    // If `dest` is a block argument, we cannot remove `op`.
+    if (dest.isa<BlockArgument>())
+      return failure();
+    auto isDeallocLikeOp = [](Operation *op) {
+      auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
+      return memOp && memOp.hasEffect<MemoryEffects::Free>();
+    };
+    // We can erase `op` iff `dest` has no uses other than its use by `op` and
+    // by dealloc ops.
+    if (llvm::any_of(dest.getUsers(), [isDeallocLikeOp, op](Operation *user) {
+          return user != op && !isDeallocLikeOp(user);
+        }))
+      return failure();
+
+    // Only erase `op` if the async token it produces (if any) has no uses.
+    if (op.asyncToken() && !op.asyncToken().use_empty())
+      return failure();
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<EraseTrivialCopyOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//
@@ -1262,6 +1301,56 @@
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_WaitOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Fold away a pair of redundant gpu.wait ops of the following form:
+///   %t = gpu.wait async
+///   gpu.wait [%t]
+struct FoldRedundantGpuWait : public OpRewritePattern<WaitOp> {
+public:
+  using OpRewritePattern<WaitOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WaitOp op,
+                                PatternRewriter &rewriter) const final {
+    // `op` must produce an async token.
+    Value token = op.asyncToken();
+    if (!token)
+      return failure();
+    // `op` must not have any async dependencies.
+    if (!op.asyncDependencies().empty())
+      return failure();
+    // The token must have a single use to fold away the gpu.wait ops.
+    if (!token.hasOneUse())
+      return failure();
+    // If the only user of `token` is not a gpu.wait op, we cannot fold away
+    // the gpu.wait ops.
+    auto waitOp = dyn_cast<WaitOp>(*token.user_begin());
+    if (!waitOp)
+      return failure();
+    // If `waitOp` itself produces a token, we cannot fold the ops away.
+    if (waitOp.asyncToken())
+      return failure();
+    // `waitOp` must have `token` as its only async dependency.
+    if (!llvm::hasSingleElement(waitOp.asyncDependencies()))
+      return failure();
+
+    rewriter.eraseOp(waitOp);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                         MLIRContext *context) {
+  results.add<FoldRedundantGpuWait>(context);
+}
+
 #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
 #include "mlir/Dialect/GPU/GPUOpsEnums.cpp.inc"
 
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -1,5 +1,28 @@
 // RUN: mlir-opt %s -canonicalize --split-input-file -allow-unregistered-dialect | FileCheck %s
 
+// CHECK-LABEL: func @fold_memcpy_op
+func @fold_memcpy_op(%arg0: i1) {
+  %cst = arith.constant 3.343820e-05 : f16
+  %cst_0 = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<400x1024x1024x1xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<400x1024x1024x1xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0, 0, 0, 0] : memref<400x1024x1024x1xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<400x1024x1024x1xf16>, memref<400x1024x1024x1xf16>
+  gpu.wait [%3]
+  %5 = scf.if %arg0 -> (i1) {
+    memref.dealloc %1 : memref<400x1024x1024x1xf16>
+    scf.yield %arg0 : i1
+  } else {
+    memref.dealloc %1 : memref<400x1024x1024x1xf16>
+    scf.yield %arg0 : i1
+  }
+  return
+}
+// CHECK-NOT: gpu.memcpy
+
 // CHECK-LABEL: @memcpy_after_cast
 func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
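
A minimal sketch (not part of the patch) of the IR shape the new
FoldRedundantGpuWait pattern targets; the function name, CHECK lines, and SSA
value names below are illustrative only and are not taken from the patch's
test changes. Assuming -canonicalize is run, both waits should be erased:

// CHECK-LABEL: func @fold_redundant_waits
func @fold_redundant_waits() {
  // A wait whose only purpose is to materialize a token with a single use ...
  %token = gpu.wait async
  // ... consumed by one synchronous wait that produces no token of its own.
  gpu.wait [%token]
  return
}
// CHECK-NOT: gpu.wait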