diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -970,6 +970,7 @@
   }];
   let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 def GPU_MemsetOp : GPU_Op<"memset",
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1064,6 +1064,63 @@
   printer << "]";
 }
 
+namespace {
+
+/// Erases a common case of copy ops where a destination value is used only by
+/// the copy op, alloc and dealloc ops.
+struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
+  using OpRewritePattern<MemcpyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(MemcpyOp op,
+                                PatternRewriter &rewriter) const override {
+    Value dest = op.dst();
+    // If `dest` is a block argument, we cannot remove `op`.
+    if (dest.isa<BlockArgument>())
+      return failure();
+    auto isDeallocLikeOp = [](Operation *op) {
+      auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
+      return memOp && memOp.hasEffect<MemoryEffects::Free>();
+    };
+    // We can erase `op` iff `dest` has no other use apart from its
+    // use by `op` and dealloc ops.
+    if (llvm::any_of(dest.getUsers(), [isDeallocLikeOp, op](Operation *user) {
+          return user != op && !isDeallocLikeOp(user);
+        }))
+      return failure();
+
+    // Bail out if the async token produced by `op` still has uses; erasing
+    // `op` would leave them dangling.
+    if (op.asyncToken() && !op.asyncToken().use_empty())
+      return failure();
+
+    // Copy the async dependencies before erasing `op`, since a ValueRange
+    // would point into the erased op's operand storage.
+    SmallVector<Value, 1> asyncDependencies(op.asyncDependencies());
+    rewriter.eraseOp(op);
+
+    // Remove the now-redundant gpu.wait ops. If `op` had a single async
+    // dependency token, and that token has a single remaining user of type
+    // gpu.wait (with no async result and a single dependency), we can erase
+    // that gpu.wait op along with the op defining the token.
+    if (asyncDependencies.size() == 1 && asyncDependencies[0].hasOneUse()) {
+      if (auto waitOp = dyn_cast<WaitOp>(*asyncDependencies[0].user_begin())) {
+        if (!waitOp.asyncToken() && waitOp.asyncDependencies().size() == 1) {
+          rewriter.eraseOp(waitOp);
+          rewriter.eraseOp(asyncDependencies[0].getDefiningOp());
+        }
+      }
+    }
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<EraseTrivialCopyOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -1,5 +1,28 @@
 // RUN: mlir-opt %s -canonicalize --split-input-file -allow-unregistered-dialect | FileCheck %s
 
+// CHECK-LABEL: func @fold_memcpy_op
+func @fold_memcpy_op(%arg0: i1) {
+  %cst = arith.constant 3.343820e-05 : f16
+  %cst_0 = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<400x1024x1024x1xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<400x1024x1024x1xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0, 0, 0, 0] : memref<400x1024x1024x1xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<400x1024x1024x1xf16>, memref<400x1024x1024x1xf16>
+  gpu.wait [%3]
+  %5 = scf.if %arg0 -> (i1) {
+    memref.dealloc %1 : memref<400x1024x1024x1xf16>
+    scf.yield %arg0 : i1
+  } else {
+    memref.dealloc %1 : memref<400x1024x1024x1xf16>
+    scf.yield %arg0 : i1
+  }
+  return
+}
+// CHECK-NOT: gpu.memcpy
+
 // CHECK-LABEL: @memcpy_after_cast
 func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
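
Illustrative example (not part of the patch; the function and value names are
made up): assuming a temporary buffer %tmp that is only allocated, copied into,
and deallocated, the new canonicalization is expected to rewrite

  func @example(%src: memref<16xf32>) {
    %tmp = memref.alloc() : memref<16xf32>
    %t0 = gpu.wait async
    %t1 = gpu.memcpy async [%t0] %tmp, %src : memref<16xf32>, memref<16xf32>
    gpu.wait [%t0]
    memref.dealloc %tmp : memref<16xf32>
    return
  }

into

  func @example(%src: memref<16xf32>) {
    %tmp = memref.alloc() : memref<16xf32>
    memref.dealloc %tmp : memref<16xf32>
    return
  }

because %tmp is never read after the copy: the gpu.memcpy is erased, and the
two gpu.wait ops that only synchronized the copy become redundant and are
removed along with it.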