diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -861,6 +861,8 @@
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
   }];
+
+  let hasCanonicalizer = 1;
 }
 
 def GPU_AllocOp : GPU_Op<"alloc", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1170,6 +1170,89 @@
   return foldMemRefCast(*this);
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_WaitOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Fold away redundant gpu.wait ops of the following pattern:
+/// %t = gpu.wait async
+/// gpu.wait [%t]
+struct EraseRedundantGpuWaitOpPairs : public OpRewritePattern<WaitOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WaitOp op,
+                                PatternRewriter &rewriter) const final {
+    // Check whether `op` produces the `asyncToken` result.
+    Value token = op.asyncToken();
+    if (!token)
+      return failure();
+    // Check whether `op` has any async dependencies.
+    if (!op.asyncDependencies().empty())
+      return failure();
+    // If the token does not have a single use, we cannot fold away the
+    // gpu.wait ops.
+    if (!token.hasOneUse())
+      return failure();
+    // If the only op operating on `token` is not a gpu.wait op, we cannot fold
+    // away gpu.wait ops.
+    auto tokenUser = dyn_cast<WaitOp>(*token.user_begin());
+    if (!tokenUser)
+      return failure();
+    // If `tokenUser` produces any token, we cannot fold away the gpu.wait ops.
+    if (tokenUser.asyncToken())
+      return failure();
+    // `tokenUser` should have only a single async dependency.
+    if (!llvm::hasSingleElement(tokenUser.asyncDependencies()))
+      return failure();
+
+    rewriter.eraseOp(tokenUser);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+// clang-format off
+/// Simplify trivial gpu.wait ops for the following patterns.
+/// 1. %t = gpu.wait async ... ops, where %t has no uses.
+/// 2. gpu.wait [] ops, i.e. gpu.wait ops that neither have any async dependencies nor return any token.
+/// 3. %t1 = gpu.wait async [%t0], in this case, we can replace uses of %t1 with %t0.
+// clang-format on
+struct SimplifyGpuWaitOp : public OpRewritePattern<WaitOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WaitOp op,
+                                PatternRewriter &rewriter) const final {
+    // Erase gpu.wait [] ops, i.e. gpu.wait ops which neither have any async
+    // dependencies nor return any token.
+    if (op.asyncDependencies().empty() && !op.asyncToken()) {
+      rewriter.eraseOp(op);
+      return success();
+    }
+    // Erase %t1 = gpu.wait async [%t0] ops. In this case, we can replace uses
+    // of %t1 with %t0.
+    if (llvm::hasSingleElement(op.asyncDependencies()) && op.asyncToken()) {
+      rewriter.replaceOp(op, op.asyncDependencies());
+      return success();
+    }
+    // Erase %t = gpu.wait async ... ops, where %t has no uses.
+    if (op.asyncToken() && op.asyncToken().use_empty()) {
+      rewriter.eraseOp(op);
+      return success();
+    }
+    return failure();
+  }
+};
+
+} // end anonymous namespace
+
+void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                         MLIRContext *context) {
+  results.add<EraseRedundantGpuWaitOpPairs, SimplifyGpuWaitOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_AllocOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -1,5 +1,32 @@
 // RUN: mlir-opt %s -canonicalize --split-input-file -allow-unregistered-dialect | FileCheck %s
 
+// Fold all the gpu.wait ops as they are redundant.
+// CHECK-LABEL: func @fold_wait_op_test1
+func @fold_wait_op_test1() {
+  %1 = gpu.wait async
+  gpu.wait []
+  %3 = gpu.wait async
+  gpu.wait [%3]
+  return
+}
+// CHECK-NOT: gpu.wait
+
+// Replace uses of gpu.wait op with its async dependency.
+// CHECK-LABEL: func @fold_wait_op_test2
+func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] () : memref<5xf16>
+  gpu.wait [%0]
+  %1 = gpu.wait async [%0]
+  %memref_0, %asyncToken_0 = gpu.alloc async [%1] () : memref<5xf16>
+  gpu.wait [%1]
+  return %memref, %memref_0 : memref<5xf16>, memref<5xf16>
+}
+// CHECK: %[[TOKEN:.*]] = gpu.wait async
+// CHECK: gpu.wait [%[[TOKEN]]]
+// CHECK-NOT: gpu.wait async
+// CHECK: gpu.wait [%[[TOKEN]]]
+
 // CHECK-LABEL: @memcpy_after_cast
 func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast