diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -861,6 +861,8 @@
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
   }];
+
+  let hasCanonicalizer = 1;
 }
 
 def GPU_AllocOp : GPU_Op<"alloc", [
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1222,6 +1222,89 @@
   return foldMemRefCast(*this);
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_WaitOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Erase a pair of gpu.wait ops that only synchronize with each other:
+///   %t = gpu.wait async
+///   gpu.wait [%t]
+struct EraseRedundantGpuWaitOpPairs : public OpRewritePattern<WaitOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WaitOp op,
+                                PatternRewriter &rewriter) const final {
+    // `op` must produce an async token ...
+    Value token = op.asyncToken();
+    if (!token)
+      return failure();
+    // ... and must not have any async dependencies of its own.
+    if (!op.asyncDependencies().empty())
+      return failure();
+    // The token must have exactly one use; otherwise `op` has to stay.
+    if (!token.hasOneUse())
+      return failure();
+    // That single user must itself be a gpu.wait op; any other kind of user
+    // still needs the token.
+    auto tokenUser = dyn_cast<WaitOp>(*token.user_begin());
+    if (!tokenUser)
+      return failure();
+    // `tokenUser` must not produce a token of its own.
+    if (tokenUser.asyncToken())
+      return failure();
+    // `token` must be the only async dependency of `tokenUser`.
+    if (!llvm::hasSingleElement(tokenUser.asyncDependencies()))
+      return failure();
+    // Erase the user first so that `op` has no remaining uses when erased.
+    rewriter.eraseOp(tokenUser);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+/// Simplify trivial gpu.wait ops matching one of the following patterns:
+/// 1. gpu.wait [] ops, i.e. gpu.wait ops that neither have any async
+///    dependencies nor return a token, are erased.
+/// 2. %t1 = gpu.wait async [%t0] ops are erased and the uses of %t1 are
+///    replaced with %t0.
+/// 3. %t = gpu.wait async ... ops, where %t has no uses, are erased.
+struct SimplifyGpuWaitOp : public OpRewritePattern<WaitOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WaitOp op,
+                                PatternRewriter &rewriter) const final {
+    // Erase gpu.wait [] ops, i.e. gpu.wait ops which neither have any async
+    // dependencies nor return a token.
+    if (op.asyncDependencies().empty() && !op.asyncToken()) {
+      rewriter.eraseOp(op);
+      return success();
+    }
+    // Erase %t1 = gpu.wait async [%t0] ops, replacing the uses of %t1 with
+    // %t0.
+    if (llvm::hasSingleElement(op.asyncDependencies()) && op.asyncToken()) {
+      rewriter.replaceOp(op, op.asyncDependencies());
+      return success();
+    }
+    // Erase %t = gpu.wait async ... ops, where %t has no uses.
+    if (op.asyncToken() && op.asyncToken().use_empty()) {
+      rewriter.eraseOp(op);
+      return success();
+    }
+    return failure();
+  }
+};
+
+} // end anonymous namespace
+
+void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                         MLIRContext *context) {
+  results.add<EraseRedundantGpuWaitOpPairs, SimplifyGpuWaitOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_AllocOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -23,6 +23,33 @@
 }
 // CHECK-NOT: gpu.memcpy
 
+// Fold all the gpu.wait ops as they are redundant.
+// CHECK-LABEL: func @fold_wait_op_test1
+func @fold_wait_op_test1() {
+  %1 = gpu.wait async
+  gpu.wait []
+  %3 = gpu.wait async
+  gpu.wait [%3]
+  return
+}
+// CHECK-NOT: gpu.wait
+
+// Replace uses of gpu.wait op with its async dependency.
+// CHECK-LABEL: func @fold_wait_op_test2
+func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] () : memref<5xf16>
+  gpu.wait [%0]
+  %1 = gpu.wait async [%0]
+  %memref_0, %asyncToken_0 = gpu.alloc async [%1] () : memref<5xf16>
+  gpu.wait [%1]
+  return %memref, %memref_0 : memref<5xf16>, memref<5xf16>
+}
+// CHECK: %[[TOKEN:.*]] = gpu.wait async
+// CHECK: gpu.wait [%[[TOKEN]]]
+// CHECK-NOT: gpu.wait async
+// CHECK: gpu.wait [%[[TOKEN]]]
+
 // CHECK-LABEL: @memcpy_after_cast
 func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
 // CHECK-NOT: memref.cast