diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -1007,6 +1007,7 @@
   }];
   let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 def GPU_MemsetOp : GPU_Op<"memset",
diff --git a/mlir/include/mlir/Interfaces/SideEffectInterfaces.h b/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
--- a/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
+++ b/mlir/include/mlir/Interfaces/SideEffectInterfaces.h
@@ -248,6 +248,10 @@
 // SideEffect Utilities
 //===----------------------------------------------------------------------===//
 
+/// Returns true if `op` has only the given effect on `value`.
+template <typename EffectTy>
+bool hasSingleEffect(Operation *op, Value value);
+
 /// Return true if the given operation is unused, and has no side effects on
 /// memory that prevent erasing.
 bool isOpTriviallyDead(Operation *op);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -24,6 +24,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -1105,6 +1106,48 @@
   return success();
 }
 
+namespace {
+
+/// Erases a common case of copy ops where a destination value is used only by
+/// the copy op, alloc and dealloc ops.
+struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
+  using OpRewritePattern<MemcpyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(MemcpyOp op,
+                                PatternRewriter &rewriter) const override {
+    Value dest = op.dst();
+    Operation *destDefOp = dest.getDefiningOp();
+    // `dest` must be defined by an op having an Allocate memory effect in
+    // order to perform the folding.
+    if (!destDefOp ||
+        !hasSingleEffect<MemoryEffects::Allocate>(destDefOp, dest))
+      return failure();
+    // We can erase `op` iff `dest` has no other use apart from its
+    // use by `op` and dealloc ops.
+    if (llvm::any_of(dest.getUsers(), [op, dest](Operation *user) {
+          return user != op &&
+                 !hasSingleEffect<MemoryEffects::Free>(user, dest);
+        }))
+      return failure();
+    // We can perform the folding if and only if op has a single async
+    // dependency and produces an async token as result, or if it does not
+    // have any async dependency and does not produce any async token result.
+    if (op.asyncDependencies().size() > 1 ||
+        ((op.asyncDependencies().empty() && op.asyncToken()) ||
+         (!op.asyncDependencies().empty() && !op.asyncToken())))
+      return failure();
+    rewriter.replaceOp(op, op.asyncDependencies());
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<EraseTrivialCopyOp>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Interfaces/SideEffectInterfaces.cpp b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
--- a/mlir/lib/Interfaces/SideEffectInterfaces.cpp
+++ b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
@@ -90,6 +90,33 @@
   return true;
 }
 
+template <typename EffectTy>
+bool mlir::hasSingleEffect(Operation *op, Value value) {
+  auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
+  if (!memOp)
+    return false;
+  SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4> effects;
+  memOp.getEffects(effects);
+  bool doesOpOnlyHaveSingleEffectOnVal = false;
+  // Iterate through `effects` and check that an effect of type `EffectTy` is
+  // the only effect present on `value`.
+  for (auto &effect : effects) {
+    if (effect.getValue() == value && isa<EffectTy>(effect.getEffect()))
+      doesOpOnlyHaveSingleEffectOnVal = true;
+    if (effect.getValue() == value && !isa<EffectTy>(effect.getEffect())) {
+      doesOpOnlyHaveSingleEffectOnVal = false;
+      break;
+    }
+  }
+  return doesOpOnlyHaveSingleEffectOnVal;
+}
+
+template bool mlir::hasSingleEffect<MemoryEffects::Allocate>(Operation *,
+                                                             Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Free>(Operation *, Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Read>(Operation *, Value);
+template bool mlir::hasSingleEffect<MemoryEffects::Write>(Operation *, Value);
+
 bool mlir::wouldOpBeTriviallyDead(Operation *op) {
   if (op->mightHaveTrait<OpTrait::IsTerminator>())
     return false;
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -28,6 +28,70 @@
 // CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
 // CHECK-NEXT: return
 
+// CHECK-LABEL: func @fold_memcpy_op
+func.func @fold_memcpy_op(%arg0: i1) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = scf.if %arg0 -> (i1) {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  } else {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  }
+  return
+}
+// CHECK-NOT: gpu.memcpy
+
+// We cannot fold the memcpy here, as dest is a block argument.
+// CHECK-LABEL: func @do_not_fold_memcpy_op1
+func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  return
+}
+// CHECK: gpu.memcpy
+
+// We cannot fold gpu.memcpy as dest is used by an op having a read effect on it.
+// CHECK-LABEL: func @do_not_fold_memcpy_op2
+func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = memref.load %1[%arg1] : memref<2xf16>
+  return %5 : f16
+}
+// CHECK: gpu.memcpy
+
+// We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
+// CHECK-LABEL: func @do_not_fold_memcpy_op3
+func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
+  %0 = arith.constant 0 : index
+  %1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
+  gpu.memcpy %1, %arg1 : memref<i1>, memref<i1>
+  func.return
+}
+// CHECK: gpu.memcpy
+
 // CHECK-LABEL: @memcpy_after_cast
 func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   // CHECK-NOT: memref.cast
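Below is a minimal sketch (not part of the diff) of the rewrite EraseTrivialCopyOp performs, assuming the destination is produced by gpu.alloc and otherwise used only by gpu.dealloc; the value names and the %src operand are illustrative:

  %t0 = gpu.wait async
  %dst, %t1 = gpu.alloc async [%t0] () : memref<2xf16>
  // %src : memref<2xf16> is assumed to be defined earlier.
  %t2 = gpu.memcpy async [%t1] %dst, %src : memref<2xf16>, memref<2xf16>
  %t3 = gpu.dealloc async [%t2] %dst : memref<2xf16>
  gpu.wait [%t3]

Here %dst is defined by an allocate-only op, its only other user is a dealloc, and the memcpy has exactly one async dependency while producing an async token, so the pattern erases the gpu.memcpy and replaces its token %t2 with the dependency %t1:

  %t0 = gpu.wait async
  %dst, %t1 = gpu.alloc async [%t0] () : memref<2xf16>
  %t3 = gpu.dealloc async [%t1] %dst : memref<2xf16>
  gpu.wait [%t3]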