diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -999,6 +999,7 @@
   }];
   let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 def GPU_MemsetOp : GPU_Op<"memset",
diff --git a/mlir/include/mlir/Interfaces/SideEffectInterfaces.h b/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
--- a/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
+++ b/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
@@ -248,6 +248,10 @@
 // SideEffect Utilities
 //===----------------------------------------------------------------------===//
 
+/// Returns true if this operation only has the given effect on `value`.
+template <typename EffectTy>
+bool hasSingleEffect(Operation *op, Value value);
+
 /// Return true if the given operation is unused, and has no side effects on
 /// memory that prevent erasing.
 bool isOpTriviallyDead(Operation *op);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -24,6 +24,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -1064,6 +1065,48 @@
   printer << "]";
 }
 
+namespace {
+
+/// Erases a common case of copy ops where a destination value is used only by
+/// the copy op, alloc and dealloc ops.
+struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
+  using OpRewritePattern<MemcpyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(MemcpyOp op,
+                                PatternRewriter &rewriter) const override {
+    Value dest = op.dst();
+    Operation *destDefOp = dest.getDefiningOp();
+    // `dest` must be defined by an op having Allocate memory effect in order to
+    // perform the folding.
+    if (!destDefOp ||
+        !hasSingleEffect<MemoryEffects::Allocate>(destDefOp, dest))
+      return failure();
+    // We can erase `op` iff `dest` has no other use apart from its
+    // use by `op` and dealloc ops.
+    if (llvm::any_of(dest.getUsers(), [op, dest](Operation *user) {
+          return user != op &&
+                 !hasSingleEffect<MemoryEffects::Free>(user, dest);
+        }))
+      return failure();
+    // We can perform the folding if and only if `op` has a single async
+    // dependency and produces an async token as result, or if it has no
+    // async dependency and produces no async token.
+    if (op.asyncDependencies().size() > 1 ||
+        ((op.asyncDependencies().empty() && op.asyncToken()) ||
+         (!op.asyncDependencies().empty() && !op.asyncToken())))
+      return failure();
+    rewriter.replaceOp(op, op.asyncDependencies());
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<EraseTrivialCopyOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Interfaces/SideEffectInterfaces.cpp b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
--- a/mlir/lib/Interfaces/SideEffectInterfaces.cpp
+++ b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
@@ -90,6 +90,32 @@
   return true;
 }
 
+template <typename EffectTy>
+bool mlir::hasSingleEffect(Operation *op, Value value) {
+  llvm::SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4>
+      effects;
+  auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
+  if (!memOp)
+    return false;
+  memOp.getEffects(effects);
+  bool doesOpOnlyHaveSingleEffectOnVal = false;
+  for (auto &effect : effects) {
+    if (effect.getValue() == value && isa<EffectTy>(effect.getEffect()))
+      doesOpOnlyHaveSingleEffectOnVal = true;
+    if (effect.getValue() == value && !isa<EffectTy>(effect.getEffect())) {
+      doesOpOnlyHaveSingleEffectOnVal = false;
+      break;
+    }
+  }
+  return doesOpOnlyHaveSingleEffectOnVal;
+}
+
+template bool mlir::hasSingleEffect<MemoryEffects::Allocate>(Operation *,
+                                                             Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Free>(Operation *, Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Read>(Operation *, Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Write>(Operation *, Value);
+
 bool mlir::wouldOpBeTriviallyDead(Operation *op) {
   if (op->mightHaveTrait<OpTrait::IsTerminator>())
     return false;
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -28,6 +28,70 @@
 // CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
 // CHECK-NEXT: return
 
+// CHECK-LABEL: func @fold_memcpy_op
+func @fold_memcpy_op(%arg0: i1) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = scf.if %arg0 -> (i1) {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  } else {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  }
+  return
+}
+// CHECK-NOT: gpu.memcpy
+
+// We cannot fold memcpy here as dest is a block argument.
+// CHECK-LABEL: func @do_not_fold_memcpy_op1
+func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  return
+}
+// CHECK: gpu.memcpy
+
+// We cannot fold gpu.memcpy as it is used by an op having a read effect on dest.
+// CHECK-LABEL: func @do_not_fold_memcpy_op2
+func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = memref.load %1[%arg1] : memref<2xf16>
+  return %5 : f16
+}
+// CHECK: gpu.memcpy
+
+// We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
+// CHECK-LABEL: func @do_not_fold_memcpy_op3
+func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
+  %0 = arith.constant 0 : index
+  %1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
+  gpu.memcpy %1, %arg1 : memref<i1>, memref<i1>
+  func.return
+}
+// CHECK: gpu.memcpy
+
 // CHECK-LABEL: @memcpy_after_cast
 func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
diff --git a/utils/arcanist/clang-format.sh b/utils/arcanist/clang-format.sh
--- a/utils/arcanist/clang-format.sh
+++ b/utils/arcanist/clang-format.sh
@@ -51,7 +51,7 @@
 # We do not look for clang-format-diff or clang-format-diff.py in the PATH
 # because whether/how these are installed differs between distributions,
 # and we have an executable copy in the tree anyway.
-arc_base_commit=$(arc which --show-base)
+arc_base_commit=$(/opt/arcanist/bin/arc which --show-base)
 git diff-index -U0 "${arc_base_commit}" "${src_file}" \
   | clang/tools/clang-format/clang-format-diff.py -style file -i -p1