diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -291,12 +291,14 @@
   let parser = [{ return parseGPUFuncOp(parser, result); }];
 }
 
-def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
-    Arguments<(ins SymbolRefAttr:$kernel,
+def GPU_LaunchFuncOp : GPU_Op<"launch_func",
+                              [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
+    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   SymbolRefAttr:$kernel,
                    Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                    Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                    Variadic<AnyType>:$operands)>,
-    Results<(outs)> {
+    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "Launches a function as a GPU kernel";
 
   let description = [{
@@ -308,14 +310,22 @@
     function is required to be a gpu.module. And finally, the module containing
     the kernel module (which thus cannot be the top-level module) is required
     to have the `gpu.container_module` attribute. The `gpu.launch_func`
-    operation has a symbol attribute named `kernel` to identify the fully
+    operation has a symbol attribute named `kernel` to identify the fully
     specified kernel function to launch (both the gpu.module and func).
 
-    The operation takes at least six operands, with the first three operands
-    being grid sizes along x,y,z dimensions and the following three being block
-    sizes along x,y,z dimensions. When a lower-dimensional kernel is required,
-    unused sizes must be explicitly set to `1`. The remaining operands are
-    passed as arguments to the kernel function.
+    The `gpu.launch_func` supports async dependencies: the kernel does not start
+    executing until the ops producing those async dependencies have completed.
+
+    By default, the host implicitly blocks until kernel execution has
+    completed. If the `async` keyword is present, the host does not block but
+    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
+    token as a dependency.
+
+    The operation requires at least the grid and block sizes along the x,y,z
+    dimensions as arguments. When a lower-dimensional kernel is required,
+    unused sizes must be explicitly set to `1`.
+
+    The remaining operands are passed as arguments to the kernel function.
 
     Example:
 
@@ -351,11 +361,15 @@
       }
     }
 
-    gpu.launch_func
-        @kernels::@kernel_1                          // Kernel module and function.
-        grid in (%cst, %cst, %cst)                   // Grid sizes.
-        block in (%cst, %cst, %cst)                  // Block sizes.
-        (%arg0 : f32, %arg1 : memref<?xf32, 1>)      // Arguments passed to the kernel.
+    %t0 = gpu.wait async
+    %t1 = gpu.launch_func
+        @kernels::@kernel_1                          // Kernel module and function.
+        async                                        // (Optional) Don't block host, return token.
+        [%t0]                                        // (Optional) Execute only after %t0 has completed.
+        grid in (%cst, %cst, %cst)                   // Grid sizes.
+        block in (%cst, %cst, %cst)                  // Block sizes.
+        (%arg0 : f32,                                // Arguments passed to the kernel.
+         %arg1 : memref<?xf32, 1>)
   }
   ```
   }];
@@ -403,6 +417,7 @@
   let verifier = [{ return ::verify(*this); }];
   let assemblyFormat = [{
       $kernel
+      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
      `grid` `in` custom() `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
      `block` `in` custom() `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
      custom<LaunchFuncOperands>($operands, type($operands))
@@ -748,4 +763,46 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    This op synchronizes the host or the device with a list of dependent ops.
+
+    If the op contains the `async` keyword, it returns a new async token which
+    is synchronized with the op arguments. This new token is merely a shortcut
+    to the argument list, and one could replace the uses of the result with the
+    arguments for the same effect. The async version of this op is primarily
+    used to make each async token have a single use during lowering and
+    thereby make forks in async execution explicit. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    %t2 = gpu.wait async [%t0, %t1]
+    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
+    // as if the async dependencies were [%t0, %t1].
+    %t3 = gpu.baz async [%t2]
+    ```
+
+    If the op does not contain the `async` keyword, it does not return a new
+    async token but blocks until all ops producing the async dependency tokens
+    have finished execution. All dependent memory operations are visible to the
+    host once this op completes. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
+    gpu.wait [%t0, %t1]
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -438,10 +438,15 @@
   auto kernelSymbol = builder.getSymbolRefAttr(
       kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())});
   result.addAttribute(getKernelAttrName(), kernelSymbol);
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = 0; // Initially no async dependencies.
+  segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getI32VectorAttr(segmentSizes));
 }
 
 unsigned LaunchFuncOp::getNumKernelOperands() {
-  return getNumOperands() - kNumConfigOperands;
+  return getNumOperands() - asyncDependencies().size() - kNumConfigOperands;
 }
 
 StringRef LaunchFuncOp::getKernelModuleName() {
@@ -451,15 +456,17 @@
 StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); }
 
 Value LaunchFuncOp::getKernelOperand(unsigned i) {
-  return getOperation()->getOperand(i + kNumConfigOperands);
+  return getOperand(asyncDependencies().size() + kNumConfigOperands + i);
 }
 
 KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
-  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[0], operands[1], operands[2]};
 }
 
 KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
-  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
 static LogicalResult verify(LaunchFuncOp op) {
@@ -840,6 +847,30 @@
                      /*printBlockTerminators=*/false);
 }
 
+static ParseResult parseAsyncDependencies(
+    OpAsmParser &parser, Type &asyncTokenType,
+    SmallVectorImpl<OpAsmParser::OperandType> &asyncDependencies) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return parser.parseOperandList(asyncDependencies,
+                                 OpAsmParser::Delimiter::OptionalSquare);
+}
+
+static void printAsyncDependencies(OpAsmPrinter &printer, Type asyncTokenType,
+                                   OperandRange asyncDependencies) {
+  if (asyncTokenType)
+    printer << "async ";
+  if (asyncDependencies.empty())
+    return;
+  printer << "[";
+  llvm::interleaveComma(asyncDependencies, printer);
+  printer << "]";
+}
+
 #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
 
 #define GET_OP_CLASSES
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -21,6 +21,7 @@
     %c8 = constant 8 : index
     %c32 = constant 32 : i32
     "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %c32, %buffer) {
+      operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 2]> : vector<8xi32>,
       kernel = @kernel_module::@kernel
     } : (index, index, index, index, index, index, i32, memref<?xf32>) -> ()
     return
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -37,6 +37,7 @@
 func @launch_func_too_few_operands(%sz : index) {
   // expected-error@+1 {{expected 6 or more operands}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>}
       : (index, index, index, index, index) -> ()
   return
 }
@@ -55,6 +56,7 @@
 func @launch_func_missing_callee_attribute(%sz : index) {
   // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
       : (index, index, index, index, index, index) -> ()
   return
 }
@@ -428,3 +430,17 @@
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
   }
 }
+
+// -----
+
+func @sync_wait_with_result() {
+  // expected-error @+1 {{cannot name an operation with no results}}
+  %t = gpu.wait
+}
+
+// -----
+
+func @async_wait_without_result() {
+  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
+  gpu.wait async
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -73,6 +73,7 @@
     %1 = "op"() : () -> (memref<?xf32, 1>)
     // CHECK: %{{.*}} = constant 8
     %cst = constant 8 : index
+    %t0 = gpu.wait async
 
     // CHECK: gpu.launch_func @kernels::@kernel_1 grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_1 grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
@@ -80,6 +81,9 @@
 
     // CHECK: gpu.launch_func @kernels::@kernel_2 grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_2 grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
 
+    // CHECK: %{{.*}} = gpu.launch_func @kernels::@kernel_1 async [%{{.*}}] grid in (%{{.*}}, %{{.*}}, %{{.*}}) block in (%{{.*}}, %{{.*}}, %{{.*}}) (%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+    %t1 = gpu.launch_func @kernels::@kernel_1 async [%t0] grid in (%cst, %cst, %cst) block in (%cst, %cst, %cst) (%0 : f32, %1 : memref<?xf32, 1>)
+
     return
   }
@@ -145,4 +149,21 @@
     // CHECK: return {{.*}} : !gpu.async.token
     return %arg0 : !gpu.async.token
   }
+
+  // CHECK-LABEL: func @async_wait
+  func @async_wait() {
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
+    %1 = gpu.wait async [%0]
+    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
+    %2 = gpu.wait async [%0, %1]
+    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
+    // CHECK-NOT: async
+    gpu.wait [%0, %1]
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    gpu.wait // Valid, but a no-op.
+    return
+  }
 }
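The `operand_segment_sizes` attribute that the updated tests attach to the generic form encodes how the flat operand list of `gpu.launch_func` is split across the segments declared in `Arguments`: the variadic async dependencies, the six grid/block sizes, and the variadic kernel arguments. A minimal sketch of a generic-form launch under that layout (the SSA value names, kernel symbol, and argument types here are illustrative, not taken from the patch):

```mlir
// No async dependencies (first segment is 0), six launch sizes, and two
// kernel arguments; the eight segment sizes must sum to the operand count.
"gpu.launch_func"(%gx, %gy, %gz, %bx, %by, %bz, %arg0, %arg1) {
  kernel = @kernels::@kernel_1,
  operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 2]> : vector<8xi32>
} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
```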