diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -291,12 +291,14 @@ let parser = [{ return parseGPUFuncOp(parser, result); }]; } -def GPU_LaunchFuncOp : GPU_Op<"launch_func">, - Arguments<(ins SymbolRefAttr:$kernel, +def GPU_LaunchFuncOp : GPU_Op<"launch_func", + [GPU_AsyncOpInterface, AttrSizedOperandSegments]>, + Arguments<(ins Variadic:$asyncDependencies, + SymbolRefAttr:$kernel, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, Variadic:$operands)>, - Results<(outs)> { + Results<(outs Optional:$asyncToken)> { let summary = "Launches a function as a GPU kernel"; let description = [{ @@ -308,14 +310,22 @@ function is required to be a gpu.module. And finally, the module containing the kernel module (which thus cannot be the top-level module) is required to have the `gpu.container_module` attribute. The `gpu.launch_func` - operation has a symbol attribute named `kernel` to identify the fully + operation has a symbol attribute named `kernel` to identify the fully specified kernel function to launch (both the gpu.module and func). - The operation takes at least six operands, with the first three operands - being grid sizes along x,y,z dimensions and the following three being block - sizes along x,y,z dimensions. When a lower-dimensional kernel is required, - unused sizes must be explicitly set to `1`. The remaining operands are - passed as arguments to the kernel function. + The `gpu.launch_func` supports async dependencies: the kernel does not start + executing until the ops producing those async dependencies have completed. + + By default, the host implicitly blocks until kernel execution has + completed. If the `async` keyword is present, the host does not block but + instead a `!gpu.async.token` is returned. 
Other async GPU ops can take this + token as dependency. + + The operation requires at least the grid and block sizes along the x,y,z + dimensions as arguments. When a lower-dimensional kernel is required, + unused sizes must be explicitly set to `1`. + + The remaining operands are passed as arguments to the kernel function. Example: @@ -351,11 +361,15 @@ } } + %t0 = gpu.wait async gpu.launch_func - @kernels::@kernel_1 // Kernel function. - blocks in (%cst, %cst, %cst) // Grid size. - threads in (%cst, %cst, %cst) // Block size. - args(%arg0 : f32, %arg1 : memref) // Kernel arguments. + async // (Optional) Don't block host, return token. + [%t0] // (Optional) Execute only after %t0 has completed. + @kernels::@kernel_1 // Kernel function. + blocks in (%cst, %cst, %cst) // Grid size. + threads in (%cst, %cst, %cst) // Block size. + args(%arg0 : f32, // (Optional) Kernel arguments. + %arg1 : memref) } ``` }]; @@ -402,6 +416,7 @@ let verifier = [{ return ::verify(*this); }]; let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) $kernel `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)` `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)` diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -438,10 +438,15 @@ auto kernelSymbol = builder.getSymbolRefAttr( kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())}); result.addAttribute(getKernelAttrName(), kernelSymbol); + SmallVector segmentSizes(8, 1); + segmentSizes.front() = 0; // Initially no async dependencies. 
+ segmentSizes.back() = static_cast(kernelOperands.size()); + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getI32VectorAttr(segmentSizes)); } unsigned LaunchFuncOp::getNumKernelOperands() { - return getNumOperands() - kNumConfigOperands; + return getNumOperands() - asyncDependencies().size() - kNumConfigOperands; } StringRef LaunchFuncOp::getKernelModuleName() { @@ -451,15 +456,17 @@ StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); } Value LaunchFuncOp::getKernelOperand(unsigned i) { - return getOperation()->getOperand(i + kNumConfigOperands); + return getOperand(asyncDependencies().size() + kNumConfigOperands + i); } KernelDim3 LaunchFuncOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(asyncDependencies().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(asyncDependencies().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; } static LogicalResult verify(LaunchFuncOp op) { diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -20,9 +20,10 @@ func @foo(%buffer: memref) { %c8 = constant 8 : index %c32 = constant 32 : i32 - "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %c32, %buffer) { - kernel = @kernel_module::@kernel - } : (index, index, index, index, index, index, i32, memref) -> () + gpu.launch_func @kernel_module::@kernel + blocks in (%c8, %c8, %c8) + threads in (%c8, %c8, %c8) + args(%c32 : i32, %buffer : memref) return } diff --git 
a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir --- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir @@ -3,7 +3,8 @@ module attributes {gpu.container_module} { func @builtin() { %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_x} : (index, index, index, index, index, index) -> () + gpu.launch_func @kernels::@builtin_workgroup_id_x + blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) return } @@ -26,7 +27,8 @@ module attributes {gpu.container_module} { func @builtin() { %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_y} : (index, index, index, index, index, index) -> () + gpu.launch_func @kernels::@builtin_workgroup_id_y + blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) return } @@ -49,7 +51,8 @@ module attributes {gpu.container_module} { func @builtin() { %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @kernels::@builtin_workgroup_id_z} : (index, index, index, index, index, index) -> () + gpu.launch_func @kernels::@builtin_workgroup_id_z + blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) return } diff --git a/mlir/test/Conversion/GPUToSPIRV/if.mlir b/mlir/test/Conversion/GPUToSPIRV/if.mlir --- a/mlir/test/Conversion/GPUToSPIRV/if.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/if.mlir @@ -7,7 +7,9 @@ } { func @main(%arg0 : memref<10xf32>, %arg1 : i1) { %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = @kernels::@kernel_simple_selection} : (index, index, index, index, index, index, memref<10xf32>, i1) -> () + gpu.launch_func @kernels::@kernel_simple_selection + blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) + args(%arg0 : memref<10xf32>, %arg1 : i1) return } diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir 
b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -15,7 +15,10 @@ %1 = subi %c4, %c0_0 : index %c1_1 = constant 1 : index %c1_2 = constant 1 : index - "gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = @kernels::@load_store_kernel} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> () + gpu.launch_func @kernels::@load_store_kernel + blocks in (%0, %c1_2, %c1_2) threads in (%1, %c1_2, %c1_2) + args(%arg0 : memref<12x4xf32>, %arg1 : memref<12x4xf32>, %arg2 : memref<12x4xf32>, + %c0 : index, %c0_0 : index, %c1 : index, %c1_1 : index) return } diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -7,7 +7,9 @@ } { func @loop(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>) { %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = @kernels::@loop_kernel} : (index, index, index, index, index, index, memref<10xf32>, memref<10xf32>) -> () + gpu.launch_func @kernels::@loop_kernel + blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) + args(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>) return } diff --git a/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir b/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir --- a/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/module-structure-opencl.mlir @@ -22,8 +22,9 @@ %0 = "op"() : () -> (f32) %1 = "op"() : () -> (memref<12xf32, 11>) %cst = constant 1 : index - "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@basic_module_structure } - : (index, index, index, index, index, index, f32, memref<12xf32, 
11>) -> () + gpu.launch_func @kernels::@basic_module_structure + blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + args(%0 : f32, %1 : memref<12xf32, 11>) return } } diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -18,8 +18,9 @@ %0 = "op"() : () -> (f32) %1 = "op"() : () -> (memref<12xf32>) %cst = constant 1 : index - "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@basic_module_structure } - : (index, index, index, index, index, index, f32, memref<12xf32>) -> () + gpu.launch_func @kernels::@basic_module_structure + blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + args(%0 : f32, %1 : memref<12xf32>) return } } @@ -63,8 +64,9 @@ %0 = "op"() : () -> (f32) %1 = "op"() : () -> (memref<12xf32>) %cst = constant 1 : index - "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernels::@missing_entry_point_abi } - : (index, index, index, index, index, index, f32, memref<12xf32>) -> () + gpu.launch_func @kernels::@missing_entry_point_abi + blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + args(%0 : f32, %1 : memref<12xf32>) return } } diff --git a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir --- a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir +++ b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir @@ -26,7 +26,10 @@ func @foo() { %0 = alloc() : memref<12xf32> %c1 = constant 1 : index - "gpu.launch_func"(%c1, %c1, %c1, %c1, %c1, %c1, %0) {kernel = @kernels::@kernel} : (index, index, index, index, index, index, memref<12xf32>) -> () + gpu.launch_func @kernels::@kernel + blocks in(%c1, %c1, %c1) + threads in(%c1, %c1, %c1) + args(%0 : memref<12xf32>) return } } diff --git 
a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -37,6 +37,7 @@ func @launch_func_too_few_operands(%sz : index) { // expected-error@+1 {{expected 6 or more operands}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz) + {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>} : (index, index, index, index, index) -> () return } @@ -55,6 +56,7 @@ func @launch_func_missing_callee_attribute(%sz : index) { // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) + {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index, index) -> () return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -73,6 +73,7 @@ %1 = "op"() : () -> (memref) // CHECK: %{{.*}} = constant 8 %cst = constant 8 : index + %t0 = gpu.wait async // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref) gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref) @@ -80,6 +81,9 @@ // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) + %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + return } diff --git a/mlir/test/mlir-vulkan-runner/addf.mlir b/mlir/test/mlir-vulkan-runner/addf.mlir --- a/mlir/test/mlir-vulkan-runner/addf.mlir +++ 
b/mlir/test/mlir-vulkan-runner/addf.mlir @@ -37,8 +37,9 @@ %cst1 = constant 1 : index %cst8 = constant 8 : index - "gpu.launch_func"(%cst8, %cst1, %cst1, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_add } - : (index, index, index, index, index, index, memref<8xf32>, memref<8xf32>, memref<8xf32>) -> () + gpu.launch_func @kernels::@kernel_add + blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1) + args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>) %arg6 = memref_cast %arg5 : memref to memref<*xf32> call @print_memref_f32(%arg6) : (memref<*xf32>) -> () return diff --git a/mlir/test/mlir-vulkan-runner/addi.mlir b/mlir/test/mlir-vulkan-runner/addi.mlir --- a/mlir/test/mlir-vulkan-runner/addi.mlir +++ b/mlir/test/mlir-vulkan-runner/addi.mlir @@ -36,8 +36,9 @@ %cst1 = constant 1 : index %cst8 = constant 8 : index - "gpu.launch_func"(%cst8, %cst8, %cst8, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_addi } - : (index, index, index, index, index, index, memref<8xi32>, memref<8x8xi32>, memref<8x8x8xi32>) -> () + gpu.launch_func @kernels::@kernel_addi + blocks in (%cst8, %cst8, %cst8) threads in (%cst1, %cst1, %cst1) + args(%arg0 : memref<8xi32>, %arg1 : memref<8x8xi32>, %arg2 : memref<8x8x8xi32>) %arg6 = memref_cast %arg5 : memref to memref<*xi32> call @print_memref_i32(%arg6) : (memref<*xi32>) -> () return diff --git a/mlir/test/mlir-vulkan-runner/addi8.mlir b/mlir/test/mlir-vulkan-runner/addi8.mlir --- a/mlir/test/mlir-vulkan-runner/addi8.mlir +++ b/mlir/test/mlir-vulkan-runner/addi8.mlir @@ -37,8 +37,9 @@ %cst1 = constant 1 : index %cst8 = constant 8 : index - "gpu.launch_func"(%cst8, %cst8, %cst8, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_addi } - : (index, index, index, index, index, index, memref<8xi8>, memref<8x8xi8>, memref<8x8x8xi32>) -> () + gpu.launch_func @kernels::@kernel_addi + blocks in (%cst8, %cst8, %cst8) threads in (%cst1, %cst1, %cst1) + 
args(%arg0 : memref<8xi8>, %arg1 : memref<8x8xi8>, %arg2 : memref<8x8x8xi32>) %arg6 = memref_cast %arg5 : memref to memref<*xi32> call @print_memref_i32(%arg6) : (memref<*xi32>) -> () return diff --git a/mlir/test/mlir-vulkan-runner/mulf.mlir b/mlir/test/mlir-vulkan-runner/mulf.mlir --- a/mlir/test/mlir-vulkan-runner/mulf.mlir +++ b/mlir/test/mlir-vulkan-runner/mulf.mlir @@ -38,8 +38,9 @@ %cst1 = constant 1 : index %cst4 = constant 4 : index - "gpu.launch_func"(%cst4, %cst4, %cst1, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_mul } - : (index, index, index, index, index, index, memref<4x4xf32>, memref<4x4xf32>, memref<4x4xf32>) -> () + gpu.launch_func @kernels::@kernel_mul + blocks in (%cst4, %cst4, %cst1) threads in(%cst1, %cst1, %cst1) + args(%arg0 : memref<4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<4x4xf32>) %arg6 = memref_cast %arg5 : memref to memref<*xf32> call @print_memref_f32(%arg6) : (memref<*xf32>) -> () return diff --git a/mlir/test/mlir-vulkan-runner/subf.mlir b/mlir/test/mlir-vulkan-runner/subf.mlir --- a/mlir/test/mlir-vulkan-runner/subf.mlir +++ b/mlir/test/mlir-vulkan-runner/subf.mlir @@ -40,8 +40,9 @@ %cst1 = constant 1 : index %cst4 = constant 4 : index %cst8 = constant 8 : index - "gpu.launch_func"(%cst8, %cst4, %cst4, %cst1, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_sub } - : (index, index, index, index, index, index, memref<8x4x4xf32>, memref<4x4xf32>, memref<8x4x4xf32>) -> () + gpu.launch_func @kernels::@kernel_sub + blocks in (%cst8, %cst4, %cst4) threads in (%cst1, %cst1, %cst1) + args(%arg0 : memref<8x4x4xf32>, %arg1 : memref<4x4xf32>, %arg2 : memref<8x4x4xf32>) %arg6 = memref_cast %arg5 : memref to memref<*xf32> call @print_memref_f32(%arg6) : (memref<*xf32>) -> () return diff --git a/mlir/test/mlir-vulkan-runner/time.mlir b/mlir/test/mlir-vulkan-runner/time.mlir --- a/mlir/test/mlir-vulkan-runner/time.mlir +++ b/mlir/test/mlir-vulkan-runner/time.mlir @@ -44,8 +44,9 @@ %cst1 = 
constant 1 : index %cst128 = constant 128 : index - "gpu.launch_func"(%cst128, %cst1, %cst1, %cst128, %cst1, %cst1, %arg0, %arg1, %arg2) { kernel = @kernels::@kernel_add } - : (index, index, index, index, index, index, memref<16384xf32>, memref<16384xf32>, memref<16384xf32>) -> () + gpu.launch_func @kernels::@kernel_add + blocks in (%cst128, %cst1, %cst1) threads in (%cst128, %cst1, %cst1) + args(%arg0 : memref<16384xf32>, %arg1 : memref<16384xf32>, %arg2 : memref<16384xf32>) %arg6 = memref_cast %arg5 : memref to memref<*xf32> return }