diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -756,4 +756,46 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    This op synchronizes the host or the device with a list of dependent ops.
+
+    If the op contains the `async` keyword, it returns a new async token which
+    is synchronized with the op arguments. This new token is merely a shortcut
+    to the argument list, and one could replace the uses of the result with the
+    arguments for the same effect. The async version of this op is primarily
+    used to make each async token have a single use during lowering and
+    thereby make forks in async execution explicit. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    %t2 = gpu.wait async [%t0, %t1]
+    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
+    // as if the async dependencies were [%t0, %t1].
+    %t3 = gpu.baz async [%t2]
+    ```
+
+    If the op does not contain the `async` keyword, it does not return a new
+    async token but blocks until all ops producing the async dependency tokens
+    finished execution. All dependent memory operations are visible to the host
+    once this op completes. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
+    gpu.wait [%t0, %t1]
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -818,6 +818,37 @@
                 /*printBlockTerminators=*/false);
 }
 
+/// Parses the optional `async` keyword followed by an optional,
+/// square-bracketed list of async dependency operands. When `async` is
+/// present, the op must have a named result, and `asyncTokenType` is set to
+/// the async token type; otherwise it is left untouched.
+static ParseResult parseAsyncDependencies(
+    OpAsmParser &parser, Type &asyncTokenType,
+    SmallVectorImpl<OpAsmParser::OperandType> &asyncDependencies) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return parser.parseOperandList(asyncDependencies,
+                                 OpAsmParser::Delimiter::OptionalSquare);
+}
+
+/// Prints `async ` if `asyncTokenType` is non-null, followed by the
+/// comma-separated async dependencies in square brackets; the bracketed list
+/// is omitted entirely when there are no dependencies.
+static void printAsyncDependencies(OpAsmPrinter &printer, Type asyncTokenType,
+                                   OperandRange asyncDependencies) {
+  if (asyncTokenType)
+    printer << "async ";
+  if (asyncDependencies.empty())
+    return;
+  printer << "[";
+  llvm::interleaveComma(asyncDependencies, printer);
+  printer << "]";
+}
+
 #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
 
 #define GET_OP_CLASSES
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -435,3 +435,17 @@
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
   }
 }
+
+// -----
+
+func @sync_wait_with_result() {
+  // expected-error @+1 {{cannot name an operation with no results}}
+  %t = gpu.wait
+}
+
+// -----
+
+func @async_wait_without_result() {
+  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
+  gpu.wait async
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -149,4 +149,21 @@
     // CHECK: return {{.*}} : !gpu.async.token
     return %arg0 : !gpu.async.token
   }
+
+  func @async_wait() {
+    // CHECK-LABEL: func @async_wait
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
+    %1 = gpu.wait async [%0]
+    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
+    %2 = gpu.wait async [%0, %1]
+    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
+    // CHECK-NOT: async
+    gpu.wait [%0, %1]
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    gpu.wait // Valid, but a no-op.
+    return
+  }
 }