diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1052,6 +1052,38 @@
   let hasFolder = 1;
 }
 
+def GPU_MemzeroOp : GPU_Op<"memzero", [GPU_AsyncOpInterface]> {
+
+  let summary = "GPU memzero operation";
+
+  let description = [{
+    The `gpu.memzero` operation sets the content of the memref to zero.
+
+    The op does not execute before all async dependencies have finished
+    executing.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.memzero async [%dep] %dst : memref<?xf32, 1>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Arg<AnyMemRef, "", [MemWrite]>:$dst);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $dst `:` type($dst) attr-dict
+  }];
+  let hasFolder = 1;
+}
+
 def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device",
                                     [MemoryEffects<[MemWrite]>]>,
     Arguments<(ins I32:$devIndex)> {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1268,6 +1268,11 @@
   return foldMemRefCast(*this);
 }
 
+LogicalResult MemzeroOp::fold(ArrayRef<Attribute> operands,
+                              SmallVectorImpl<::mlir::OpFoldResult> &results) {
+  return foldMemRefCast(*this);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_WaitOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -111,6 +111,15 @@
   return
 }
 
+// CHECK-LABEL: @memzero_after_cast
+func.func @memzero_after_cast(%arg0: memref<10xf32>) {
+  // CHECK-NOT: memref.cast
+  // CHECK: gpu.memzero
+  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
+  gpu.memzero %0 : memref<?xf32>
+  return
+}
+
 // -----
 
 // Test case: Folding of memref.dim(gpu.alloc(%size), %idx) -> %size
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -257,6 +257,17 @@
     return
   }
 
+  func.func @memzero(%dst : memref<3x7xf32>) {
+    // CHECK-LABEL: func @memzero
+    // CHECK: gpu.memzero {{.*}} : memref<3x7xf32>
+    gpu.memzero %dst : memref<3x7xf32>
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: {{.*}} = gpu.memzero async [%[[t0]]] {{.*}} : memref<3x7xf32>
+    %1 = gpu.memzero async [%0] %dst : memref<3x7xf32>
+    return
+  }
+
   func.func @mmamatrix_valid_element_type(%src : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>>){
     // CHECK-LABEL: func @mmamatrix_valid_element_type
    %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
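
Not part of the patch — a minimal usage sketch, assuming `gpu.memzero` lands with the assembly format defined above. It shows the op in the usual async token chain between `gpu.alloc` and `gpu.dealloc`; the function name and buffer shape are illustrative only:

```mlir
// Hypothetical example (not from the patch): zero-initialize a freshly
// allocated device buffer, chaining alloc, memzero, and dealloc on
// !gpu.async.token values so none of them blocks the host.
func.func @zeroed_device_buffer() {
  %t0 = gpu.wait async
  %memref, %t1 = gpu.alloc async [%t0] () : memref<256xf32>
  %t2 = gpu.memzero async [%t1] %memref : memref<256xf32>
  %t3 = gpu.dealloc async [%t2] %memref : memref<256xf32>
  // Block the host until the whole chain has finished on the device.
  gpu.wait [%t3]
  return
}
```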