diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 8ad3c9abe6cc..593b735f01c9 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -1,792 +1,807 @@ //===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Defines some operations of the GPU dialect. // //===----------------------------------------------------------------------===// #ifndef GPU_OPS #define GPU_OPS include "mlir/Dialect/GPU/GPUBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // GPU Dialect operations. //===----------------------------------------------------------------------===// class GPU_Op<string mnemonic, list<OpTrait> traits = []> : Op<GPU_Dialect, mnemonic, traits>; class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> : GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>, Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> { let verifier = [{ return ::verifyIndexOp(*this); }]; } def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> { let description = [{ Returns the number of threads in the thread block (aka the block size) along the x, y, or z `dimension`. Example: ```mlir %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) ``` }]; } def GPU_BlockIdOp : GPU_IndexOp<"block_id"> { let description = [{ Returns the block id, i.e. the index of the current block within the grid along the x, y, or z `dimension`. Example: ```mlir %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) ``` }]; } def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> { let description = [{ Returns the number of thread blocks in the grid along the x, y, or z `dimension`. Example: ```mlir %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) ``` }]; } def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> { let description = [{ Returns the thread id, i.e. the index of the current thread within the block along the x, y, or z `dimension`. Example: ```mlir %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) ``` }]; } def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the subgroup id, i.e. the index of the current subgroup within the workgroup. Example: ```mlir %sgId = gpu.subgroup_id : index ``` }]; let assemblyFormat = "attr-dict `:` type($result)"; let verifier = [{ return success(); }]; } def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [NoSideEffect]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of subgroups within a workgroup. Example: ```mlir %numSg = gpu.num_subgroups : index ``` }]; let assemblyFormat = "attr-dict `:` type($result)"; let verifier = [{ return success(); }]; } def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [NoSideEffect]>, Arguments<(ins)>, Results<(outs Index:$result)> { let description = [{ Returns the number of threads within a subgroup.
Example: ```mlir %sgSz = gpu.subgroup_size : index ``` }]; let assemblyFormat = "attr-dict `:` type($result)"; let verifier = [{ return success(); }]; } def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionLike, IsolatedFromAbove, Symbol]> { let summary = "Function executable on a GPU"; let description = [{ Defines a function that can be executed on a GPU. This supports memory attribution and its body has a particular execution model. GPU functions are either kernels (as indicated by the `kernel` attribute) or regular functions. The former can be launched from the host side, while the latter are device side only. The memory attribution defines SSA values that correspond to memory buffers allocated in the memory hierarchy of the GPU (see below). The operation has one attached region that corresponds to the body of the function. The region arguments consist of the function arguments without modification, followed by buffers defined in memory annotations. The body of a GPU function, when launched, is executed by multiple work items. There are no guarantees on the order in which work items execute, or on the connection between them. In particular, work items are not necessarily executed in lock-step. Synchronization ops such as "gpu.barrier" should be used to coordinate work items. Declarations of GPU functions, i.e. not having the body region, are not supported. Syntax: ``` op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->` function-result-list)? memory-attribution `kernel`? function-attributes? region memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? (`private` `(` ssa-id-and-type-list `)`)? ``` Example: ```mlir gpu.func @foo(%arg0: index) workgroup(%workgroup: memref<32xf32, 3>) private(%private: memref<1xf32, 5>) kernel attributes {qux: "quux"} { gpu.return } ``` The generic form illustrates the concept ```mlir "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({ ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>, %private: memref<1xf32, 5>): "gpu.return"() : () -> () }) : (index) -> () ``` Note the non-default memory spaces used in memref types in memory attribution. }]; let regions = (region AnyRegion:$body); let skipDefaultBuilders = 1; let builders = [ OpBuilderDAG<(ins "StringRef":$name, "FunctionType":$type, CArg<"TypeRange", "{}">:$workgroupAttributions, CArg<"TypeRange", "{}">:$privateAttributions, CArg<"ArrayRef", "{}">:$attrs)> ]; let extraClassDeclaration = [{ /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. /// it is intended to be launched from host. bool isKernel() { return getAttrOfType(GPUDialect::getKernelFuncAttrName()) != nullptr; } /// Change the type of this function in place. This is an extremely /// dangerous operation and it is up to the caller to ensure that this is /// legal for this function, and to restore invariants: /// - the entry block args must be updated to match the function params. /// - the argument/result attributes may need an update: if the new type /// has less parameters we drop the extra attributes, if there are more /// parameters they won't have any attributes. // TODO: consider removing this function thanks to rewrite patterns. void setType(FunctionType newType); /// Returns the number of buffers located in the workgroup memory. 
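/// The count is stored in the op's `workgroup_attributions` integer attribute (see getNumWorkgroupAttributionsAttrName() below).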
unsigned getNumWorkgroupAttributions() { return getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName()) .getInt(); } /// Returns a list of block arguments that correspond to buffers located in /// the workgroup memory. ArrayRef<BlockArgument> getWorkgroupAttributions() { auto begin = std::next(getBody().args_begin(), getType().getNumInputs()); auto end = std::next(begin, getNumWorkgroupAttributions()); return {begin, end}; } /// Adds a new block argument that corresponds to buffers located in /// workgroup memory. BlockArgument addWorkgroupAttribution(Type type); /// Returns the number of buffers located in the private memory. unsigned getNumPrivateAttributions() { return getBody().getNumArguments() - getType().getNumInputs() - getNumWorkgroupAttributions(); } /// Returns a list of block arguments that correspond to buffers located in /// the private memory. ArrayRef<BlockArgument> getPrivateAttributions() { // Buffers on the private memory always come after buffers on the workgroup // memory. auto begin = std::next(getBody().args_begin(), getType().getNumInputs() + getNumWorkgroupAttributions()); return {begin, getBody().args_end()}; } /// Adds a new block argument that corresponds to buffers located in /// private memory. BlockArgument addPrivateAttribution(Type type); /// Returns the name of the attribute containing the number of buffers /// located in the workgroup memory. static StringRef getNumWorkgroupAttributionsAttrName() { return "workgroup_attributions"; } // FunctionLike trait needs access to the functions below. friend class OpTrait::FunctionLike<GPUFuncOp>; /// Hooks for the input/output type enumeration in FunctionLike. unsigned getNumFuncArguments() { return getType().getNumInputs(); } unsigned getNumFuncResults() { return getType().getNumResults(); } /// Returns the keywords used in the custom syntax for this Op. static StringRef getWorkgroupKeyword() { return "workgroup"; } static StringRef getPrivateKeyword() { return "private"; } static StringRef getKernelKeyword() { return "kernel"; } /// Hook for FunctionLike verifier. LogicalResult verifyType(); /// Verifies the body of the function. LogicalResult verifyBody(); }]; let printer = [{ printGPUFuncOp(p, *this); }]; let parser = [{ return parseGPUFuncOp(parser, result); }]; } -def GPU_LaunchFuncOp : GPU_Op<"launch_func">, - Arguments<(ins SymbolRefAttr:$kernel, +def GPU_LaunchFuncOp : GPU_Op<"launch_func", + [GPU_AsyncOpInterface, AttrSizedOperandSegments]>, + Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies, + SymbolRefAttr:$kernel, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, Variadic<AnyType>:$operands)>, - Results<(outs)> { + Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> { let summary = "Launches a function as a GPU kernel"; let description = [{ Launch a kernel function on the specified grid of thread blocks. `gpu.launch` operations are lowered to `gpu.launch_func` operations by outlining the kernel body into a function in a dedicated module, which reflects the separate compilation process. The kernel function is required to have the `gpu.kernel` attribute. The module containing the kernel function is required to be a gpu.module. And finally, the module containing the kernel module (which thus cannot be the top-level module) is required to have the `gpu.container_module` attribute.
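A kernel is thus referenced through a nested symbol such as `@kernels::@kernel_1` (see the example below).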
The `gpu.launch_func` - operation has a symbol attribute named `kernel` to identify the fully + operation has a symbol attribute named `kernel` to identify the fully specified kernel function to launch (both the gpu.module and func). - The operation takes at least six operands, with the first three operands - being grid sizes along x,y,z dimensions and the following three being block - sizes along x,y,z dimensions. When a lower-dimensional kernel is required, - unused sizes must be explicitly set to `1`. The remaining operands are - passed as arguments to the kernel function. + The `gpu.launch_func` operation supports async dependencies: the kernel does not start + executing until the ops producing those async dependencies have completed. + + By default, the host implicitly blocks until kernel execution has + completed. If the `async` keyword is present, the host does not block but + instead a `!gpu.async.token` is returned. Other async GPU ops can take this + token as a dependency. + + The operation requires at least the grid and block sizes along the x,y,z + dimensions as arguments. When a lower-dimensional kernel is required, + unused sizes must be explicitly set to `1`. + + The remaining operands are passed as arguments to the kernel function. Example: ```mlir module attributes {gpu.container_module} { // This module creates a separate compilation unit for the GPU compiler. gpu.module @kernels { func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32>) attributes { nvvm.kernel = true } { // Operations that produce block/thread IDs and dimensions are // injected when outlining the `gpu.launch` body to a function called // by `gpu.launch_func`. %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) "some_op"(%bIdX, %tIdX) : (index, index) -> () %42 = load %arg1[%bIdX] : memref<?xf32> } } + %t0 = gpu.wait async gpu.launch_func - @kernels::@kernel_1 // Kernel function. - blocks in (%cst, %cst, %cst) // Grid size. - threads in (%cst, %cst, %cst) // Block size. - args(%arg0 : f32, %arg1 : memref<?xf32>) // Kernel arguments. + async // (Optional) Don't block host, return token. + [%t0] // (Optional) Execute only after %t0 has completed. + @kernels::@kernel_1 // Kernel function. + blocks in (%cst, %cst, %cst) // Grid size. + threads in (%cst, %cst, %cst) // Block size. + args(%arg0 : f32, // (Optional) Kernel arguments. + %arg1 : memref<?xf32>) } ``` }]; let skipDefaultBuilders = 1; let builders = [ OpBuilderDAG<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize, "KernelDim3":$blockSize, "ValueRange":$kernelOperands)> ]; let extraClassDeclaration = [{ /// The number of operands passed to the kernel function. unsigned getNumKernelOperands(); /// The name of the kernel's containing module. StringRef getKernelModuleName(); /// The name of the kernel. StringRef getKernelName(); /// The i-th operand passed to the kernel function.
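/// Kernel operands follow the async dependencies and the launch configuration in the operand list.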
Value getKernelOperand(unsigned i); /// Get the SSA values passed as operands to specify the grid size. KernelDim3 getGridSizeOperandValues(); /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); /// The number of launch configuration operands, placed at the leading /// positions of the operand list. static constexpr unsigned kNumConfigOperands = 6; // This needs to quietly verify if attributes with names defined below are // present since it is run before the verifier of this op. friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *, NamedAttribute); /// The name of the symbol reference attribute specifying the kernel to launch. static StringRef getKernelAttrName() { return "kernel"; } }]; let verifier = [{ return ::verify(*this); }]; let assemblyFormat = [{ + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $kernel `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)` `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)` custom<LaunchFuncOperands>($operands, type($operands)) attr-dict }]; } def GPU_LaunchOp : GPU_Op<"launch">, Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>, Results<(outs)> { let summary = "GPU kernel launch operation"; let description = [{ Launch a kernel on the specified grid of thread blocks. The body of the kernel is defined by the single region that this operation contains. The operation takes six operands, with the first three being grid sizes along the x,y,z dimensions and the following three being block sizes along the x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes must be explicitly set to `1`. The body region has _twelve_ arguments, grouped as follows: - three arguments that contain block identifiers along x,y,z dimensions; - three arguments that contain thread identifiers along x,y,z dimensions; - operands of the `gpu.launch` operation as is (i.e. the operands for grid and block sizes). Syntax: ``` operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment `threads` `(` ssa-id-list `)` `in` ssa-reassignment region attr-dict? ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` ``` Example: ```mlir gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) { // Block and thread identifiers, as well as block/grid sizes are // immediately usable inside the body region. "some_op"(%bx, %tx) : (index, index) -> () // Assuming %val1 is defined outside the gpu.launch region. %42 = load %val1[%bx] : memref<?xf32> } // Generic syntax explains how the pretty syntax maps to the IR structure. "gpu.launch"(%cst, %cst, %c1, // Grid sizes. %cst, %c1, %c1) // Block sizes. {/*attributes*/} // All sizes and identifiers have "index" type. : (index, index, index, index, index, index) -> () { // The operation passes block and thread identifiers, followed by grid and // block sizes. ^bb0(%bx : index, %by : index, %bz : index, %tx : index, %ty : index, %tz : index, %num_bx : index, %num_by : index, %num_bz : index, %num_tx : index, %num_ty : index, %num_tz : index) "some_op"(%bx, %tx) : (index, index) -> () %3 = "std.load"(%val1, %bx) : (memref<?xf32>, index) -> f32 } ``` Rationale: using operation/block arguments gives analyses a clear way of understanding that a value has additional semantics (e.g., we will need to know what value corresponds to threadIdx.x for coalescing).
We can recover these properties by analyzing the operations producing values, but it is easier just to have that information by construction. }]; let regions = (region AnyRegion:$body); let skipDefaultBuilders = 1; let builders = [ OpBuilderDAG<(ins "Value":$gridSizeX, "Value":$gridSizeY, "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY, "Value":$blockSizeZ)> ]; let extraClassDeclaration = [{ /// Get the SSA values corresponding to kernel block identifiers. KernelDim3 getBlockIds(); /// Get the SSA values corresponding to kernel thread identifiers. KernelDim3 getThreadIds(); /// Get the SSA values corresponding to kernel grid size. KernelDim3 getGridSize(); /// Get the SSA values corresponding to kernel block size. KernelDim3 getBlockSize(); /// Get the SSA values passed as operands to specify the grid size. KernelDim3 getGridSizeOperandValues(); /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); static StringRef getBlocksKeyword() { return "blocks"; } static StringRef getThreadsKeyword() { return "threads"; } /// The number of launch configuration operands, placed at the leading /// positions of the operand list. static constexpr unsigned kNumConfigOperands = 6; /// The number of region attributes containing the launch configuration, /// placed in the leading positions of the argument list. static constexpr unsigned kNumConfigRegionAttributes = 12; }]; let parser = [{ return parseLaunchOp(parser, result); }]; let printer = [{ printLaunchOp(p, *this); }]; let verifier = [{ return ::verify(*this); }]; } def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect, Terminator]>, Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> { let summary = "Terminator for GPU functions."; let description = [{ A terminator operation for regions that appear in the body of `gpu.func` functions. The operands to the `gpu.return` are the result values returned by an invocation of the `gpu.func`. }]; let builders = [OpBuilderDAG<(ins), [{ // empty}]>]; let parser = [{ return parseReturnOp(parser, result); }]; let printer = [{ p << getOperationName(); }]; let verifier = [{ return ::verify(*this); }]; } def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">, NoSideEffect, Terminator]>, Arguments<(ins)>, Results<(outs)> { let summary = "Terminator for GPU launch regions."; let description = [{ A terminator operation for regions that appear in the body of `gpu.launch` operation. These regions are not expected to return any value so the terminator takes no operands. }]; let parser = [{ return success(); }]; let printer = [{ p << getOperationName(); }]; } def GPU_YieldOp : GPU_Op<"yield", [NoSideEffect, Terminator]>, Arguments<(ins Variadic<AnyType>:$values)> { let summary = "GPU yield operation"; let description = [{ `gpu.yield` is a special terminator operation for blocks inside regions in gpu ops. It returns values to the immediately enclosing gpu op. Example: ```mlir gpu.yield %f0, %f1 : f32, f32 ``` }]; } // String enum attribute cases for the built-in reduction operations accepted by gpu.all_reduce.
def GPU_AllReduceOpAdd : StrEnumAttrCase<"add">; def GPU_AllReduceOpAnd : StrEnumAttrCase<"and">; def GPU_AllReduceOpMax : StrEnumAttrCase<"max">; def GPU_AllReduceOpMin : StrEnumAttrCase<"min">; def GPU_AllReduceOpMul : StrEnumAttrCase<"mul">; def GPU_AllReduceOpOr : StrEnumAttrCase<"or">; def GPU_AllReduceOpXor : StrEnumAttrCase<"xor">; def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr", "built-in reduction operations supported by gpu.all_reduce.", [ GPU_AllReduceOpAdd, GPU_AllReduceOpAnd, GPU_AllReduceOpMax, GPU_AllReduceOpMin, GPU_AllReduceOpMul, GPU_AllReduceOpOr, GPU_AllReduceOpXor ]>; def GPU_AllReduceOp : GPU_Op<"all_reduce", [SameOperandsAndResultType, IsolatedFromAbove]>, Arguments<(ins AnyType:$value, OptionalAttr<GPU_AllReduceOperationAttr>:$op)>, Results<(outs AnyType)> { let summary = "Reduce values among workgroup."; let description = [{ The `all_reduce` op reduces the value of every work item across a local workgroup. The result is equal for all work items of a workgroup. For example, both ```mlir %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32) %2 = "gpu.all_reduce"(%0) ({ ^bb(%lhs : f32, %rhs : f32): %sum = addf %lhs, %rhs : f32 "gpu.yield"(%sum) : (f32) -> () }) : (f32) -> (f32) ``` compute the sum of each work item's %0 value. The first version specifies the accumulation as an operation, whereas the second version specifies the accumulation as a code region. The accumulation operation must be one of: `add`, `and`, `max`, `min`, `mul`, `or`, `xor`. Either none or all work items of a workgroup need to execute this op in convergence. }]; let regions = (region AnyRegion:$body); let verifier = [{ return ::verifyAllReduce(*this); }]; } def GPU_ShuffleOpXor : StrEnumAttrCase<"xor">; def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr", "Indexing modes supported by gpu.shuffle.", [ GPU_ShuffleOpXor, ]>; def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>, Arguments<(ins AnyType:$value, I32:$offset, I32:$width, GPU_ShuffleModeAttr:$mode)>, Results<(outs AnyType:$result, I1:$valid)> { let summary = "Shuffles values within a subgroup."; let description = [{ The "shuffle" op moves values to a different invocation within the same subgroup. Example: ```mlir %1, %2 = gpu.shuffle %0, %offset, %width xor : f32 ``` For lane `k`, it returns the value from lane `k ^ offset` and `true` if that lane is smaller than %width. Otherwise it returns an unspecified value and `false`. A lane is the index of an invocation relative to its subgroup. The width specifies the number of invocations that participate in the shuffle. The width needs to be the same for all invocations that participate in the shuffle. Exactly the first `width` invocations of a subgroup need to execute this op in convergence. }]; let verifier = [{ return ::verifyShuffleOp(*this); }]; let printer = [{ printShuffleOp(p, *this); }]; let parser = [{ return parseShuffleOp(parser, result); }]; } def GPU_BarrierOp : GPU_Op<"barrier"> { let summary = "Synchronizes all work items of a workgroup."; let description = [{ The "barrier" op synchronizes all work items of a workgroup. It is used to coordinate communication between the work items of the workgroup. ```mlir gpu.barrier ``` waits until all work items in the workgroup have reached this point and all memory accesses made by these work items prior to the op are visible to all work items in the workgroup. Data hazards between work items accessing the same memory can be avoided by synchronizing work items in-between these accesses.
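For example, a barrier between a write to and a subsequent read from the same buffer in workgroup memory guarantees that the read observes the write.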
Either none or all work items of a workgroup need to execute this op in convergence. }]; let parser = [{ return success(); }]; let printer = [{ p << getOperationName(); }]; } def GPU_GPUModuleOp : GPU_Op<"module", [ IsolatedFromAbove, SymbolTable, Symbol, SingleBlockImplicitTerminator<"ModuleEndOp"> ]> { let summary = "A top level compilation unit containing code to be run on a GPU."; let description = [{ A GPU module contains code that is intended to be run on a GPU. A host device can launch this code through a gpu.launch_func that creates a fully qualified symbol through the gpu.module's symbol and a gpu.func symbol contained in the gpu.module. The module's top-level scope is modeled by a single region with a single block. GPU modules are required to have a name that is used for symbol resolution by the gpu.launch_func operation. Using an op with a region to define a GPU module enables "embedding" GPU modules with SIMT execution models in other dialects in a clean manner and allows filtering of code regions to execute passes on only code intended to or not intended to be run on the separate device. ``` gpu.module @symbol_name { gpu.func {} ... gpu.module_end } ``` }]; let builders = [OpBuilderDAG<(ins "StringRef":$name)>]; let parser = [{ return ::parseGPUModuleOp(parser, result); }]; let printer = [{ return ::print(p, *this); }]; let regions = (region SizedRegion<1>:$body); // We need to ensure the block inside the region is properly terminated; // the auto-generated builders do not guarantee that. let skipDefaultBuilders = 1; } def GPU_ModuleEndOp : GPU_Op<"module_end", [ Terminator, HasParent<"GPUModuleOp"> ]> { let summary = "A pseudo op that marks the end of a gpu.module."; let description = [{ This op terminates the only block inside the only region of a `gpu.module`. }]; let parser = [{ return success(); }]; let printer = [{ p << getOperationName(); }]; } def GPU_HostRegisterOp : GPU_Op<"host_register">, Arguments<(ins AnyUnrankedMemRef:$value)> { let summary = "Registers a memref for access from device."; let description = [{ This op maps the provided host buffer into the device address space. This operation may not be supported in every environment; there is not yet a way to check at runtime whether this feature is supported. Writes from the host are guaranteed to be visible to device kernels that are launched afterwards. Writes from the device are guaranteed to be visible on the host after synchronizing with the device kernel completion. }]; let assemblyFormat = "$value attr-dict `:` type($value)"; let verifier = [{ return success(); }]; } def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { let summary = "Wait for async gpu ops to complete."; let description = [{ This op synchronizes the host or the device with a list of dependent ops. If the op contains the `async` keyword, it returns a new async token which is synchronized with the op arguments. This new token is merely a shortcut to the argument list, and one could replace the uses of the result with the arguments for the same effect. The async version of this op is primarily used to make each async token have a single use during lowering and thereby make forks in async execution explicit. Example usage: ```mlir %t0 = gpu.foo async : !gpu.async.token %t1 = gpu.bar async : !gpu.async.token %t2 = gpu.wait async [%t0, %t1] // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just // as if the async dependencies were [%t0, %t1].
%t3 = gpu.baz async [%t2] ``` If the op does not contain the `async` keyword, it does not return a new async token but blocks until all ops producing the async dependency tokens have finished execution. All dependent memory operations are visible to the host once this op completes. Example usage: ```mlir %t0 = gpu.foo async : !gpu.async.token %t1 = gpu.bar async : !gpu.async.token // The gpu.wait op blocks until gpu.foo and gpu.bar have completed. gpu.wait [%t0, %t1] ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict }]; } #endif // GPU_OPS diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 9514d3e0a04b..8b0f7a3829d2 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -1,864 +1,871 @@ //===- GPUDialect.cpp - MLIR Dialect for GPU Kernels implementation -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the GPU kernel-related dialect and its operations. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/FunctionImplementation.h" #include "mlir/IR/Module.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; using namespace mlir::gpu; //===----------------------------------------------------------------------===// // GPUDialect //===----------------------------------------------------------------------===// bool GPUDialect::isKernel(Operation *op) { UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName()); return static_cast<bool>(isKernelAttr); } void GPUDialect::initialize() { addTypes<AsyncTokenType>(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/GPUOps.cpp.inc" >(); } Type GPUDialect::parseType(DialectAsmParser &parser) const { // Parse the main keyword for the type. StringRef keyword; if (parser.parseKeyword(&keyword)) return Type(); MLIRContext *context = getContext(); // Handle 'async token' types. if (keyword == "async.token") return AsyncTokenType::get(context); parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); return Type(); } void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch<Type>(type) .Case<AsyncTokenType>([&](Type) { os << "async.token"; }) .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); }); } LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, NamedAttribute attr) { if (!attr.second.isa<UnitAttr>() || attr.first != getContainerModuleAttrName()) return success(); auto module = dyn_cast<ModuleOp>(op); if (!module) return op->emitError("expected '") << getContainerModuleAttrName() << "' attribute to be attached to '" << ModuleOp::getOperationName() << '\''; auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult { // Ignore launches that are nested more or less deep than functions in the // module we are currently checking.
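// Only launches whose parent op (e.g. a function) is itself a direct child of `module` are checked here.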
if (!launchOp.getParentOp() || launchOp.getParentOp()->getParentOp() != module) return success(); // Ignore launch ops with missing attributes here. The errors will be // reported by the verifiers of those ops. if (!launchOp.getAttrOfType<SymbolRefAttr>( LaunchFuncOp::getKernelAttrName())) return success(); // Check that `launch_func` refers to a well-formed GPU kernel module. StringRef kernelModuleName = launchOp.getKernelModuleName(); auto kernelModule = module.lookupSymbol<GPUModuleOp>(kernelModuleName); if (!kernelModule) return launchOp.emitOpError() << "kernel module '" << kernelModuleName << "' is undefined"; // Check that `launch_func` refers to a well-formed kernel function. Operation *kernelFunc = module.lookupSymbol(launchOp.kernel()); auto kernelGPUFunction = dyn_cast_or_null<GPUFuncOp>(kernelFunc); auto kernelLLVMFunction = dyn_cast_or_null<LLVM::LLVMFuncOp>(kernelFunc); if (!kernelGPUFunction && !kernelLLVMFunction) return launchOp.emitOpError("kernel function '") << launchOp.kernel() << "' is undefined"; if (!kernelFunc->getAttrOfType<UnitAttr>( GPUDialect::getKernelFuncAttrName())) return launchOp.emitOpError("kernel function is missing the '") << GPUDialect::getKernelFuncAttrName() << "' attribute"; // TODO: if the kernel function has been converted to // the LLVM dialect but the caller hasn't (which happens during the // separate compilation), do not check type correspondence as it would // require the verifier to be aware of the LLVM type conversion. if (kernelLLVMFunction) return success(); unsigned actualNumArguments = launchOp.getNumKernelOperands(); unsigned expectedNumArguments = kernelGPUFunction.getNumArguments(); if (expectedNumArguments != actualNumArguments) return launchOp.emitOpError("got ") << actualNumArguments << " kernel operands but expected " << expectedNumArguments; auto functionType = kernelGPUFunction.getType(); for (unsigned i = 0; i < expectedNumArguments; ++i) { if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) { return launchOp.emitOpError("type of function argument ") << i << " does not match"; } } return success(); }); return walkResult.wasInterrupted() ?
failure() : success(); } template <typename T> static LogicalResult verifyIndexOp(T op) { auto dimension = op.dimension(); if (dimension != "x" && dimension != "y" && dimension != "z") return op.emitError("dimension \"") << dimension << "\" is invalid"; return success(); } static LogicalResult verifyAllReduce(gpu::AllReduceOp allReduce) { if (allReduce.body().empty() != allReduce.op().hasValue()) return allReduce.emitError( "expected either an op attribute or a non-empty body"); if (!allReduce.body().empty()) { if (allReduce.body().getNumArguments() != 2) return allReduce.emitError("expected two region arguments"); for (auto argument : allReduce.body().getArguments()) { if (argument.getType() != allReduce.getType()) return allReduce.emitError("incorrect region argument type"); } unsigned yieldCount = 0; for (Block &block : allReduce.body()) { if (auto yield = dyn_cast<gpu::YieldOp>(block.getTerminator())) { if (yield.getNumOperands() != 1) return allReduce.emitError("expected one gpu.yield operand"); if (yield.getOperand(0).getType() != allReduce.getType()) return allReduce.emitError("incorrect gpu.yield type"); ++yieldCount; } } if (yieldCount == 0) return allReduce.emitError("expected gpu.yield op in region"); } else { StringRef opName = *allReduce.op(); if ((opName == "and" || opName == "or" || opName == "xor") && !allReduce.getType().isa<IntegerType>()) { return allReduce.emitError() << '`' << opName << '`' << " accumulator is only compatible with Integer type"; } } return success(); } static LogicalResult verifyShuffleOp(gpu::ShuffleOp shuffleOp) { auto type = shuffleOp.value().getType(); if (shuffleOp.result().getType() != type) { return shuffleOp.emitOpError() << "requires the same type for value operand and result"; } if (!type.isSignlessIntOrFloat() || type.getIntOrFloatBitWidth() != 32) { return shuffleOp.emitOpError() << "requires value operand type to be f32 or i32"; } return success(); } static void printShuffleOp(OpAsmPrinter &p, ShuffleOp op) { p << ShuffleOp::getOperationName() << ' ' << op.getOperands() << ' ' << op.mode() << " : " << op.value().getType(); } static ParseResult parseShuffleOp(OpAsmParser &parser, OperationState &state) { SmallVector<OpAsmParser::OperandType, 3> operandInfo; if (parser.parseOperandList(operandInfo, 3)) return failure(); StringRef mode; if (parser.parseKeyword(&mode)) return failure(); state.addAttribute("mode", parser.getBuilder().getStringAttr(mode)); Type valueType; Type int32Type = parser.getBuilder().getIntegerType(32); Type int1Type = parser.getBuilder().getI1Type(); if (parser.parseColonType(valueType) || parser.resolveOperands(operandInfo, {valueType, int32Type, int32Type}, parser.getCurrentLocation(), state.operands) || parser.addTypesToList({valueType, int1Type}, state.types)) return failure(); return success(); } //===----------------------------------------------------------------------===// // AsyncOpInterface //===----------------------------------------------------------------------===// void gpu::addAsyncDependency(Operation *op, Value token) { op->insertOperands(0, {token}); if (!op->template hasTrait<OpTrait::AttrSizedOperandSegments>()) return; auto attrName = OpTrait::AttrSizedOperandSegments<void>::getOperandSegmentSizeAttr(); auto sizeAttr = op->template getAttrOfType<DenseIntElementsAttr>(attrName); if (!sizeAttr) return; // Async dependencies are the only variadic operand.
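// Their segment is the first entry of the segment sizes attribute, so incrementing that entry accounts for the token inserted at position 0 above.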
SmallVector sizes; for (auto size : sizeAttr.getIntValues()) sizes.push_back(size.getSExtValue()); ++sizes.front(); op->setAttr(attrName, Builder(op->getContext()).getI32VectorAttr(sizes)); } //===----------------------------------------------------------------------===// // LaunchOp //===----------------------------------------------------------------------===// void LaunchOp::build(OpBuilder &builder, OperationState &result, Value gridSizeX, Value gridSizeY, Value gridSizeZ, Value blockSizeX, Value blockSizeY, Value blockSizeZ) { // Add grid and block sizes as op operands, followed by the data operands. result.addOperands( {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ}); // Create a kernel body region with kNumConfigRegionAttributes + N arguments, // where the first kNumConfigRegionAttributes arguments have `index` type and // the rest have the same types as the data operands. Region *kernelRegion = result.addRegion(); Block *body = new Block(); body->addArguments( std::vector(kNumConfigRegionAttributes, builder.getIndexType())); kernelRegion->push_back(body); } KernelDim3 LaunchOp::getBlockIds() { assert(!body().empty() && "LaunchOp body must not be empty."); auto args = body().getArguments(); return KernelDim3{args[0], args[1], args[2]}; } KernelDim3 LaunchOp::getThreadIds() { assert(!body().empty() && "LaunchOp body must not be empty."); auto args = body().getArguments(); return KernelDim3{args[3], args[4], args[5]}; } KernelDim3 LaunchOp::getGridSize() { assert(!body().empty() && "LaunchOp body must not be empty."); auto args = body().getArguments(); return KernelDim3{args[6], args[7], args[8]}; } KernelDim3 LaunchOp::getBlockSize() { assert(!body().empty() && "LaunchOp body must not be empty."); auto args = body().getArguments(); return KernelDim3{args[9], args[10], args[11]}; } KernelDim3 LaunchOp::getGridSizeOperandValues() { return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; } KernelDim3 LaunchOp::getBlockSizeOperandValues() { return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; } static LogicalResult verify(LaunchOp op) { // Kernel launch takes kNumConfigOperands leading operands for grid/block // sizes and transforms them into kNumConfigRegionAttributes region arguments // for block/thread identifiers and grid/block sizes. if (!op.body().empty()) { if (op.body().getNumArguments() != LaunchOp::kNumConfigOperands + op.getNumOperands()) return op.emitOpError("unexpected number of region arguments"); } // Block terminators without successors are expected to exit the kernel region // and must be `gpu.terminator`. for (Block &block : op.body()) { if (block.empty()) continue; if (block.back().getNumSuccessors() != 0) continue; if (!isa(&block.back())) { return block.back() .emitError() .append("expected '", gpu::TerminatorOp::getOperationName(), "' or a terminator with successors") .attachNote(op.getLoc()) .append("in '", LaunchOp::getOperationName(), "' body region"); } } return success(); } // Pretty-print the kernel grid/block size assignment as // (%iter-x, %iter-y, %iter-z) in // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use) // where %size-* and %iter-* will correspond to the body region arguments. 
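// For example: (%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2).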
static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, KernelDim3 operands, KernelDim3 ids) { p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in ("; p << size.x << " = " << operands.x << ", "; p << size.y << " = " << operands.y << ", "; p << size.z << " = " << operands.z << ')'; } static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) { // Print the launch configuration. p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword(); printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(), op.getBlockIds()); p << ' ' << op.getThreadsKeyword(); printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(), op.getThreadIds()); p.printRegion(op.body(), /*printEntryBlockArgs=*/false); p.printOptionalAttrDict(op.getAttrs()); } // Parse the size assignment blocks for blocks and threads. These have the form // (%region_arg, %region_arg, %region_arg) in // (%region_arg = %operand, %region_arg = %operand, %region_arg = %operand) // where %region_arg are percent-identifiers for the region arguments to be // introduced further (SSA defs), and %operand are percent-identifiers for the // SSA value uses. static ParseResult parseSizeAssignment(OpAsmParser &parser, MutableArrayRef sizes, MutableArrayRef regionSizes, MutableArrayRef indices) { assert(indices.size() == 3 && "space for three indices expected"); SmallVector args; if (parser.parseRegionArgumentList(args, /*requiredOperandCount=*/3, OpAsmParser::Delimiter::Paren) || parser.parseKeyword("in") || parser.parseLParen()) return failure(); std::move(args.begin(), args.end(), indices.begin()); for (int i = 0; i < 3; ++i) { if (i != 0 && parser.parseComma()) return failure(); if (parser.parseRegionArgument(regionSizes[i]) || parser.parseEqual() || parser.parseOperand(sizes[i])) return failure(); } return parser.parseRParen(); } // Parses a Launch operation. // operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment // `threads` `(` ssa-id-list `)` `in` ssa-reassignment // region attr-dict? // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) { // Sizes of the grid and block. SmallVector sizes( LaunchOp::kNumConfigOperands); MutableArrayRef sizesRef(sizes); // Actual (data) operands passed to the kernel. SmallVector dataOperands; // Region arguments to be created. SmallVector regionArgs( LaunchOp::kNumConfigRegionAttributes); MutableArrayRef regionArgsRef(regionArgs); // Parse the size assignment segments: the first segment assigns grid sizes // and defines values for block identifiers; the second segment assigns block // sizes and defines values for thread identifiers. In the region argument // list, identifiers precede sizes, and block-related values precede // thread-related values. if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) || parseSizeAssignment(parser, sizesRef.take_front(3), regionArgsRef.slice(6, 3), regionArgsRef.slice(0, 3)) || parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) || parseSizeAssignment(parser, sizesRef.drop_front(3), regionArgsRef.slice(9, 3), regionArgsRef.slice(3, 3)) || parser.resolveOperands(sizes, parser.getBuilder().getIndexType(), result.operands)) return failure(); // Introduce the body region and parse it. The region has // kNumConfigRegionAttributes arguments that correspond to // block/thread identifiers and grid/block sizes, all of the `index` type. 
Type index = parser.getBuilder().getIndexType(); SmallVector dataTypes( LaunchOp::kNumConfigRegionAttributes, index); Region *body = result.addRegion(); return failure(parser.parseRegion(*body, regionArgs, dataTypes) || parser.parseOptionalAttrDict(result.attributes)); } //===----------------------------------------------------------------------===// // LaunchFuncOp //===----------------------------------------------------------------------===// void LaunchFuncOp::build(OpBuilder &builder, OperationState &result, GPUFuncOp kernelFunc, KernelDim3 gridSize, KernelDim3 blockSize, ValueRange kernelOperands) { // Add grid and block sizes as op operands, followed by the data operands. result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z}); result.addOperands(kernelOperands); auto kernelModule = kernelFunc.getParentOfType(); auto kernelSymbol = builder.getSymbolRefAttr( kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())}); result.addAttribute(getKernelAttrName(), kernelSymbol); + SmallVector segmentSizes(8, 1); + segmentSizes.front() = 0; // Initially no async dependencies. + segmentSizes.back() = static_cast(kernelOperands.size()); + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getI32VectorAttr(segmentSizes)); } unsigned LaunchFuncOp::getNumKernelOperands() { - return getNumOperands() - kNumConfigOperands; + return getNumOperands() - asyncDependencies().size() - kNumConfigOperands; } StringRef LaunchFuncOp::getKernelModuleName() { return kernel().getRootReference(); } StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); } Value LaunchFuncOp::getKernelOperand(unsigned i) { - return getOperation()->getOperand(i + kNumConfigOperands); + return getOperand(asyncDependencies().size() + kNumConfigOperands + i); } KernelDim3 LaunchFuncOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(asyncDependencies().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(asyncDependencies().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; } static LogicalResult verify(LaunchFuncOp op) { auto module = op.getParentOfType(); if (!module) return op.emitOpError("expected to belong to a module"); if (!module.getAttrOfType(GPUDialect::getContainerModuleAttrName())) return op.emitOpError( "expected the closest surrounding module to have the '" + GPUDialect::getContainerModuleAttrName() + "' attribute"); auto kernelAttr = op.getAttrOfType(op.getKernelAttrName()); if (!kernelAttr) return op.emitOpError("symbol reference attribute '" + op.getKernelAttrName() + "' must be specified"); return success(); } static ParseResult parseLaunchFuncOperands(OpAsmParser &parser, SmallVectorImpl &argNames, SmallVectorImpl &argTypes) { if (parser.parseOptionalKeyword("args")) return success(); SmallVector argAttrs; bool isVariadic = false; return impl::parseFunctionArgumentList(parser, /*allowAttributes=*/false, /*allowVariadic=*/false, argNames, argTypes, argAttrs, isVariadic); } static void printLaunchFuncOperands(OpAsmPrinter &printer, Operation *, OperandRange operands, TypeRange types) { if (operands.empty()) return; printer << "args("; llvm::interleaveComma(llvm::zip(operands, types), printer, [&](const auto &pair) { 
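// Print each kernel operand as `%operand : type`.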
printer.printOperand(std::get<0>(pair)); printer << " : "; printer.printType(std::get<1>(pair)); }); printer << ")"; } //===----------------------------------------------------------------------===// // GPUFuncOp //===----------------------------------------------------------------------===// /// Adds a new block argument that corresponds to buffers located in /// workgroup memory. BlockArgument GPUFuncOp::addWorkgroupAttribution(Type type) { auto attrName = getNumWorkgroupAttributionsAttrName(); auto attr = getAttrOfType(attrName); setAttr(attrName, IntegerAttr::get(attr.getType(), attr.getValue() + 1)); return getBody().insertArgument(getType().getNumInputs() + attr.getInt(), type); } /// Adds a new block argument that corresponds to buffers located in /// private memory. BlockArgument GPUFuncOp::addPrivateAttribution(Type type) { // Buffers on the private memory always come after buffers on the workgroup // memory. return getBody().addArgument(type); } void GPUFuncOp::build(OpBuilder &builder, OperationState &result, StringRef name, FunctionType type, TypeRange workgroupAttributions, TypeRange privateAttributions, ArrayRef attrs) { result.addAttribute(SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)); result.addAttribute(getTypeAttrName(), TypeAttr::get(type)); result.addAttribute(getNumWorkgroupAttributionsAttrName(), builder.getI64IntegerAttr(workgroupAttributions.size())); result.addAttributes(attrs); Region *body = result.addRegion(); Block *entryBlock = new Block; entryBlock->addArguments(type.getInputs()); entryBlock->addArguments(workgroupAttributions); entryBlock->addArguments(privateAttributions); body->getBlocks().push_back(entryBlock); } /// Parses a GPU function memory attribution. /// /// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? /// (`private` `(` ssa-id-and-type-list `)`)? /// /// Note that this function parses only one of the two similar parts, with the /// keyword provided as argument. static ParseResult parseAttributions(OpAsmParser &parser, StringRef keyword, SmallVectorImpl &args, SmallVectorImpl &argTypes) { // If we could not parse the keyword, just assume empty list and succeed. if (failed(parser.parseOptionalKeyword(keyword))) return success(); if (failed(parser.parseLParen())) return failure(); // Early exit for an empty list. if (succeeded(parser.parseOptionalRParen())) return success(); do { OpAsmParser::OperandType arg; Type type; if (parser.parseRegionArgument(arg) || parser.parseColonType(type)) return failure(); args.push_back(arg); argTypes.push_back(type); } while (succeeded(parser.parseOptionalComma())); return parser.parseRParen(); } /// Parses a GPU function. /// /// ::= `gpu.func` symbol-ref-id `(` argument-list `)` /// (`->` function-result-list)? memory-attribution `kernel`? /// function-attributes? region static ParseResult parseGPUFuncOp(OpAsmParser &parser, OperationState &result) { SmallVector entryArgs; SmallVector argAttrs; SmallVector resultAttrs; SmallVector argTypes; SmallVector resultTypes; bool isVariadic; // Parse the function name. 
StringAttr nameAttr; if (parser.parseSymbolName(nameAttr, ::mlir::SymbolTable::getSymbolAttrName(), result.attributes)) return failure(); auto signatureLocation = parser.getCurrentLocation(); if (failed(impl::parseFunctionSignature( parser, /*allowVariadic=*/false, entryArgs, argTypes, argAttrs, isVariadic, resultTypes, resultAttrs))) return failure(); if (entryArgs.empty() && !argTypes.empty()) return parser.emitError(signatureLocation) << "gpu.func requires named arguments"; // Construct the function type. More types will be added to the region, but // not to the function type. Builder &builder = parser.getBuilder(); auto type = builder.getFunctionType(argTypes, resultTypes); result.addAttribute(GPUFuncOp::getTypeAttrName(), TypeAttr::get(type)); // Parse workgroup memory attributions. if (failed(parseAttributions(parser, GPUFuncOp::getWorkgroupKeyword(), entryArgs, argTypes))) return failure(); // Store the number of operands we just parsed as the number of workgroup // memory attributions. unsigned numWorkgroupAttrs = argTypes.size() - type.getNumInputs(); result.addAttribute(GPUFuncOp::getNumWorkgroupAttributionsAttrName(), builder.getI64IntegerAttr(numWorkgroupAttrs)); // Parse private memory attributions. if (failed(parseAttributions(parser, GPUFuncOp::getPrivateKeyword(), entryArgs, argTypes))) return failure(); // Parse the kernel attribute if present. if (succeeded(parser.parseOptionalKeyword(GPUFuncOp::getKernelKeyword()))) result.addAttribute(GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); // Parse attributes. if (failed(parser.parseOptionalAttrDictWithKeyword(result.attributes))) return failure(); mlir::impl::addArgAndResultAttrs(builder, result, argAttrs, resultAttrs); // Parse the region. If no argument names were provided, take all names // (including those of attributions) from the entry block. auto *body = result.addRegion(); return parser.parseRegion(*body, entryArgs, argTypes); } static void printAttributions(OpAsmPrinter &p, StringRef keyword, ArrayRef values) { if (values.empty()) return; p << ' ' << keyword << '('; llvm::interleaveComma( values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); }); p << ')'; } /// Prints a GPU Func op. static void printGPUFuncOp(OpAsmPrinter &p, GPUFuncOp op) { p << GPUFuncOp::getOperationName() << ' '; p.printSymbolName(op.getName()); FunctionType type = op.getType(); impl::printFunctionSignature(p, op.getOperation(), type.getInputs(), /*isVariadic=*/false, type.getResults()); printAttributions(p, op.getWorkgroupKeyword(), op.getWorkgroupAttributions()); printAttributions(p, op.getPrivateKeyword(), op.getPrivateAttributions()); if (op.isKernel()) p << ' ' << op.getKernelKeyword(); impl::printFunctionAttributes(p, op.getOperation(), type.getNumInputs(), type.getNumResults(), {op.getNumWorkgroupAttributionsAttrName(), GPUDialect::getKernelFuncAttrName()}); p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false); } void GPUFuncOp::setType(FunctionType newType) { auto oldType = getType(); assert(newType.getNumResults() == oldType.getNumResults() && "unimplemented: changes to the number of results"); SmallVector nameBuf; for (int i = newType.getNumInputs(), e = oldType.getNumInputs(); i < e; i++) removeAttr(getArgAttrName(i, nameBuf)); setAttr(getTypeAttrName(), TypeAttr::get(newType)); } /// Hook for FunctionLike verifier. 
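/// Checks that the `type` attribute holds a function type and that kernel functions do not declare results.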
LogicalResult GPUFuncOp::verifyType() { Type type = getTypeAttr().getValue(); if (!type.isa()) return emitOpError("requires '" + getTypeAttrName() + "' attribute of function type"); if (isKernel() && getType().getNumResults() != 0) return emitOpError() << "expected void return type for kernel function"; return success(); } static LogicalResult verifyAttributions(Operation *op, ArrayRef attributions, unsigned memorySpace) { for (Value v : attributions) { auto type = v.getType().dyn_cast(); if (!type) return op->emitOpError() << "expected memref type in attribution"; if (type.getMemorySpace() != memorySpace) { return op->emitOpError() << "expected memory space " << memorySpace << " in attribution"; } } return success(); } /// Verifies the body of the function. LogicalResult GPUFuncOp::verifyBody() { unsigned numFuncArguments = getNumArguments(); unsigned numWorkgroupAttributions = getNumWorkgroupAttributions(); unsigned numBlockArguments = front().getNumArguments(); if (numBlockArguments < numFuncArguments + numWorkgroupAttributions) return emitOpError() << "expected at least " << numFuncArguments + numWorkgroupAttributions << " arguments to body region"; ArrayRef funcArgTypes = getType().getInputs(); for (unsigned i = 0; i < numFuncArguments; ++i) { Type blockArgType = front().getArgument(i).getType(); if (funcArgTypes[i] != blockArgType) return emitOpError() << "expected body region argument #" << i << " to be of type " << funcArgTypes[i] << ", got " << blockArgType; } if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(), GPUDialect::getWorkgroupAddressSpace())) || failed(verifyAttributions(getOperation(), getPrivateAttributions(), GPUDialect::getPrivateAddressSpace()))) return failure(); return success(); } //===----------------------------------------------------------------------===// // ReturnOp //===----------------------------------------------------------------------===// static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) { llvm::SmallVector operands; llvm::SmallVector types; if (parser.parseOperandList(operands) || parser.parseOptionalColonTypeList(types) || parser.resolveOperands(operands, types, parser.getCurrentLocation(), result.operands)) return failure(); return success(); } static LogicalResult verify(gpu::ReturnOp returnOp) { GPUFuncOp function = returnOp.getParentOfType(); FunctionType funType = function.getType(); if (funType.getNumResults() != returnOp.operands().size()) return returnOp.emitOpError() .append("expected ", funType.getNumResults(), " result operands") .attachNote(function.getLoc()) .append("return type declared here"); for (auto pair : llvm::enumerate( llvm::zip(function.getType().getResults(), returnOp.operands()))) { Type type; Value operand; std::tie(type, operand) = pair.value(); if (type != operand.getType()) return returnOp.emitOpError() << "unexpected type `" << operand.getType() << "' for operand #" << pair.index(); } return success(); } //===----------------------------------------------------------------------===// // GPUModuleOp //===----------------------------------------------------------------------===// void GPUModuleOp::build(OpBuilder &builder, OperationState &result, StringRef name) { ensureTerminator(*result.addRegion(), builder, result.location); result.attributes.push_back(builder.getNamedAttr( ::mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name))); } static ParseResult parseGPUModuleOp(OpAsmParser &parser, OperationState &result) { StringAttr nameAttr; if 
(parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), result.attributes)) return failure(); // If module attributes are present, parse them. if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) return failure(); // Parse the module body. auto *body = result.addRegion(); if (parser.parseRegion(*body, None, None)) return failure(); // Ensure that this module has a valid terminator. GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location); return success(); } static void print(OpAsmPrinter &p, GPUModuleOp op) { p << op.getOperationName() << ' '; p.printSymbolName(op.getName()); p.printOptionalAttrDictWithKeyword(op.getAttrs(), {SymbolTable::getSymbolAttrName()}); p.printRegion(op.getOperation()->getRegion(0), /*printEntryBlockArgs=*/false, /*printBlockTerminators=*/false); } static ParseResult parseAsyncDependencies( OpAsmParser &parser, Type &asyncTokenType, SmallVectorImpl<OpAsmParser::OperandType> &asyncDependencies) { auto loc = parser.getCurrentLocation(); if (succeeded(parser.parseOptionalKeyword("async"))) { if (parser.getNumResults() == 0) return parser.emitError(loc, "needs to be named when marked 'async'"); asyncTokenType = parser.getBuilder().getType<AsyncTokenType>(); } return parser.parseOperandList(asyncDependencies, OpAsmParser::Delimiter::OptionalSquare); } static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op, Type asyncTokenType, OperandRange asyncDependencies) { if (asyncTokenType) printer << "async "; if (asyncDependencies.empty()) return; printer << "["; llvm::interleaveComma(asyncDependencies, printer); printer << "]"; } #include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc" #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.cpp.inc" diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index 3612b8e0dcc1..3dc5be405aac 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -1,444 +1,446 @@ // RUN: mlir-opt -split-input-file -verify-diagnostics %s func @not_enough_sizes(%sz : index) { // expected-error@+1 {{expected 6 operands, but found 5}} "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({ gpu.return }) : (index, index, index, index, index) -> () return } // ----- func @no_region_attrs(%sz : index) { // expected-error@+1 {{unexpected number of region arguments}} "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({ ^bb1(%bx: index, %by: index, %bz: index, %tx: index, %ty: index, %tz: index): gpu.return }) : (index, index, index, index, index, index) -> () return } // ----- func @launch_requires_gpu_return(%sz : index) { // @expected-note@+1 {{in 'gpu.launch' body region}} gpu.launch blocks(%bx, %by, %bz) in (%sbx = %sz, %sby = %sz, %sbz = %sz) threads(%tx, %ty, %tz) in (%stx = %sz, %sty = %sz, %stz = %sz) { // @expected-error@+1 {{expected 'gpu.terminator' or a terminator with successors}} return } return } // ----- func @launch_func_too_few_operands(%sz : index) { // expected-error@+1 {{expected 6 or more operands}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz) + {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>} : (index, index, index, index, index) -> () return } // ----- func @launch_func_missing_parent_module_attribute(%sz : index) { // expected-error@+1 {{expected the closest surrounding module to have the 'gpu.container_module' attribute}} gpu.launch_func @foo::@bar blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) return } // ----- module attributes {gpu.container_module} { func @launch_func_missing_callee_attribute(%sz : index) { // expected-error@+1
#include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/GPUOps.cpp.inc"

diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 3612b8e0dcc1..3dc5be405aac 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -1,444 +1,446 @@
// RUN: mlir-opt -split-input-file -verify-diagnostics %s

func @not_enough_sizes(%sz : index) {
  // expected-error@+1 {{expected 6 operands, but found 5}}
  "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
    gpu.return
  }) : (index, index, index, index, index) -> ()
  return
}

// -----

func @no_region_attrs(%sz : index) {
  // expected-error@+1 {{unexpected number of region arguments}}
  "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
  ^bb1(%bx: index, %by: index, %bz: index,
       %tx: index, %ty: index, %tz: index):
    gpu.return
  }) : (index, index, index, index, index, index) -> ()
  return
}

// -----

func @launch_requires_gpu_return(%sz : index) {
  // @expected-note@+1 {{in 'gpu.launch' body region}}
  gpu.launch blocks(%bx, %by, %bz) in (%sbx = %sz, %sby = %sz, %sbz = %sz)
             threads(%tx, %ty, %tz) in (%stx = %sz, %sty = %sz, %stz = %sz) {
    // @expected-error@+1 {{expected 'gpu.terminator' or a terminator with successors}}
    return
  }
  return
}

// -----

func @launch_func_too_few_operands(%sz : index) {
  // expected-error@+1 {{expected 6 or more operands}}
  "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
+      {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>}
      : (index, index, index, index, index) -> ()
  return
}

// -----

func @launch_func_missing_parent_module_attribute(%sz : index) {
  // expected-error@+1 {{expected the closest surrounding module to have the 'gpu.container_module' attribute}}
  gpu.launch_func @foo::@bar blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
  return
}

// -----

module attributes {gpu.container_module} {
  func @launch_func_missing_callee_attribute(%sz : index) {
    // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
        : (index, index, index, index, index, index) -> ()
    return
  }
}

// -----

module attributes {gpu.container_module} {
  func @launch_func_no_function_attribute(%sz : index) {
    // expected-error@+1 {{custom op 'gpu.launch_func' invalid kind of attribute specified}}
    gpu.launch_func "foo" blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  func @launch_func_undefined_module(%sz : index) {
    // expected-error@+1 {{kernel module 'kernels' is undefined}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  module @kernels {
    // expected-error@+1 {{'gpu.func' op expects parent op 'gpu.module'}}
    gpu.func @kernel_1(%arg1 : !llvm.ptr<f32>) {
      gpu.return
    }
  }
}

// -----

module attributes {gpu.container_module} {
  module @kernels {
  }

  func @launch_func_missing_module_attribute(%sz : index) {
    // expected-error@+1 {{kernel module 'kernels' is undefined}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  gpu.module @kernels { }

  func @launch_func_undefined_function(%sz : index) {
    // expected-error@+1 {{kernel function '@kernels::@kernel_1' is undefined}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  module @kernels {
    gpu.func @kernel_1(%arg1 : !llvm.ptr<f32>) kernel {
      gpu.return
    }
  }

  func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm.ptr<f32>) {
    // expected-error@+1 {{kernel module 'kernels' is undefined}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : !llvm.ptr<f32>)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  gpu.module @kernels {
    gpu.func @kernel_1(%arg1 : !llvm.ptr<f32>) {
      gpu.return
    }
  }

  func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm.ptr<f32>) {
    // expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : !llvm.ptr<f32>)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  gpu.module @kernels {
    gpu.func @kernel_1(%arg1 : !llvm.ptr<f32>) kernel {
      gpu.return
    }
  }

  func @launch_func_kernel_operand_size(%sz : index, %arg : !llvm.ptr<f32>) {
    // expected-error@+1 {{got 2 kernel operands but expected 1}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : !llvm.ptr<f32>, %arg : !llvm.ptr<f32>)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  gpu.module @kernels {
    gpu.func @kernel_1(%arg1 : f32) kernel {
      gpu.return
    }
  }

  func @launch_func_kernel_operand_types(%sz : index, %arg : f32) {
    // expected-error@+1 {{type of function argument 0 does not match}}
    gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : f32)
    return
  }
}

// -----

module attributes {gpu.container_module} {
  func @launch_func_kernel_operand_attr(%sz : index) {
    // expected-error@+1 {{expected arguments without attributes}}
    gpu.launch_func @foo::@bar blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%sz : index {foo})
    return
  }
}
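// -----

// For contrast with the invalid cases above, a well-formed launch looks as
// follows (comment-only sketch, not one of this file's test cases):
//
//   gpu.launch_func @kernels::@kernel_1
//       blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
//       args(%arg : f32)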
{dimension = "o"} : () -> (index) return } // ----- func @illegal_dimension() { // expected-error@+1 {{dimension "o" is invalid}} %bDimX = "gpu.block_dim"() {dimension = "o"} : () -> (index) return } // ----- func @illegal_dimension() { // expected-error@+1 {{dimension "o" is invalid}} %bIdX = "gpu.block_id"() {dimension = "o"} : () -> (index) return } // ----- func @illegal_dimension() { // expected-error@+1 {{dimension "o" is invalid}} %gDimX = "gpu.grid_dim"() {dimension = "o"} : () -> (index) return } // ----- func @reduce_no_op_no_body(%arg0 : f32) { // expected-error@+1 {{expected either an op attribute or a non-empty body}} %res = "gpu.all_reduce"(%arg0) ({}) : (f32) -> (f32) return } // ----- func @reduce_op_and_body(%arg0 : f32) { // expected-error@+1 {{expected either an op attribute or a non-empty body}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : f32): "gpu.yield"(%lhs) : (f32) -> () }) {op = "add"} : (f32) -> (f32) } // ----- func @reduce_invalid_op(%arg0 : f32) { // expected-error@+1 {{attribute 'op' failed to satisfy constraint}} %res = "gpu.all_reduce"(%arg0) ({}) {op = "foo"} : (f32) -> (f32) return } // ----- func @reduce_invalid_op_type(%arg0 : f32) { // expected-error@+1 {{`and` accumulator is only compatible with Integer type}} %res = "gpu.all_reduce"(%arg0) ({}) {op = "and"} : (f32) -> (f32) return } // ----- func @reduce_incorrect_region_arguments(%arg0 : f32) { // expected-error@+1 {{expected two region arguments}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32): "gpu.yield"(%lhs) : (f32) -> () }) : (f32) -> (f32) } // ----- func @reduce_incorrect_region_arguments(%arg0 : f32) { // expected-error@+1 {{incorrect region argument type}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : i32): "gpu.yield"(%lhs) : (f32) -> () }) : (f32) -> (f32) } // ----- func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{expected one gpu.yield operand}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : f32): "gpu.yield"(%lhs, %rhs) : (f32, f32) -> () }) : (f32) -> (f32) } // ----- func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{incorrect gpu.yield type}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : f32): %one = constant 1 : i32 "gpu.yield"(%one) : (i32) -> () }) : (f32) -> (f32) } // ----- func @reduce_incorrect_yield(%arg0 : f32) { // expected-error@+1 {{expected gpu.yield op in region}} %res = "gpu.all_reduce"(%arg0) ({ ^bb(%lhs : f32, %rhs : f32): return }) : (f32) -> (f32) } // ----- func @shuffle_mismatching_type(%arg0 : f32, %arg1 : i32, %arg2 : i32) { // expected-error@+1 {{requires the same type for value operand and result}} %shfl, %pred = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "xor" } : (f32, i32, i32) -> (i32, i1) } // ----- func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) { // expected-error@+1 {{requires value operand type to be f32 or i32}} %shfl, %pred = gpu.shuffle %arg0, %arg1, %arg2 xor : index } // ----- module { gpu.module @gpu_funcs { // expected-error @+1 {{custom op 'gpu.func' gpu.func requires named arguments}} gpu.func @kernel_1(f32, f32) { ^bb0(%arg0: f32): gpu.return } } } // ----- module { gpu.module @gpu_funcs { // expected-error @+1 {{requires 'type' attribute of function type}} "gpu.func"() ({ gpu.return }) {sym_name="kernel_1", type=f32} : () -> () } } // ----- module { gpu.module @gpu_funcs { // expected-error @+1 {{expected memref type in attribution}} gpu.func @kernel() workgroup(%0: i32) { gpu.return } } } // ----- module { gpu.module @gpu_funcs { 
    // expected-error @+1 {{expected memory space 3 in attribution}}
    gpu.func @kernel() workgroup(%0: memref<4xf32>) {
      gpu.return
    }
  }
}

// -----

module {
  gpu.module @gpu_funcs {
    // expected-error @+1 {{expected memory space 5 in attribution}}
    gpu.func @kernel() private(%0: memref<4xf32>) {
      gpu.return
    }
  }
}

// -----

module {
  gpu.module @gpu_funcs {
    // expected-note @+1 {{return type declared here}}
    gpu.func @kernel() {
      %0 = constant 0 : index
      // expected-error @+1 {{'gpu.return' op expected 0 result operands}}
      gpu.return %0 : index
    }
  }
}

// -----

module {
  gpu.module @gpu_funcs {
    // expected-error @+1 {{'gpu.func' op expected void return type for kernel function}}
    gpu.func @kernel() -> index kernel {
      %0 = constant 0 : index
      gpu.return
    }
  }
}

// -----

module {
  gpu.module @gpu_funcs {
    // expected-error @+1 {{'gpu.func' op expected at least 5 arguments to body region}}
    "gpu.func"() ( {
    ^bb0(%arg0: f32, %arg1: memref<?xf32>, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>):
      "gpu.return"() : () -> ()
    } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
  }
}

// -----

func @sync_wait_with_result() {
  // expected-error @+1 {{cannot name an operation with no results}}
  %t = gpu.wait
}

// -----

func @async_wait_without_result() {
  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
  gpu.wait async
}

diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e81b233abfbc..a3b781afdfbc 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,165 +1,169 @@
// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s

module attributes {gpu.container_module} {

  // CHECK-LABEL:func @no_args(%{{.*}}: index)
  func @no_args(%sz : index) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
               threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
  func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32, 1>) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      "use"(%float) : (f32) -> ()
      "use"(%data) : (memref<?xf32, 1>) -> ()
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  gpu.module @kernels {
    gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
      %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
      %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
      %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)

      %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
      %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
      %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)

      %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
{dimension = "y"} : () -> (index) %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) %sgId = gpu.subgroup_id : index %numSg = gpu.num_subgroups : index %SgSi = gpu.subgroup_size : index %one = constant 1.0 : f32 %sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32) %width = constant 7 : i32 %offset = constant 3 : i32 // CHECK: gpu.shuffle %{{.*}}, %{{.*}}, %{{.*}} xor : f32 %shfl, %pred = gpu.shuffle %arg0, %offset, %width xor : f32 "gpu.barrier"() : () -> () "some_op"(%bIdX, %tIdX) : (index, index) -> () %42 = load %arg1[%bIdX] : memref gpu.return } gpu.func @kernel_2() kernel { gpu.return } } func @foo() { %0 = "op"() : () -> (f32) %1 = "op"() : () -> (memref) // CHECK: %{{.*}} = constant 8 %cst = constant 8 : index + %t0 = gpu.wait async // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref) gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref) // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) + %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) + return } gpu.module @gpu_funcs { // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32) // CHECK: workgroup // CHECK: private // CHECK: attributes gpu.func @kernel_1(%arg0: f32) workgroup(%arg1: memref<42xf32, 3>) private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>) kernel attributes {foo="bar"} { "use"(%arg1) : (memref<42xf32, 3>) -> () "use"(%arg2) : (memref<2xf32, 5>) -> () "use"(%arg3) : (memref<1xf32, 5>) -> () gpu.return } // CHECK-LABEL: gpu.func @no_attribution // CHECK: { gpu.func @no_attribution(%arg0: f32) { gpu.return } // CHECK-LABEL: @no_attribution_attrs // CHECK: attributes // CHECK: { gpu.func @no_attribution_attrs(%arg0: f32) attributes {foo="bar"} { gpu.return } // CHECK-LABEL: @workgroup_only // CHECK: workgroup({{.*}}: {{.*}}) // CHECK: { gpu.func @workgroup_only() workgroup(%arg0: memref<42xf32, 3>) { gpu.return } // CHECK-LABEL: @private_only // CHECK: private({{.*}}: {{.*}}) // CHECK: { gpu.func @private_only() private(%arg0: memref<2xf32, 5>) { gpu.return } // CHECK-LABEL: @empty_attribution // CHECK: { gpu.func @empty_attribution(%arg0: f32) workgroup() private() { gpu.return } } gpu.module @explicit_attributions { // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>) "gpu.func"() ( { ^bb0(%arg0: f32, %arg1: memref, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>): "gpu.return"() : () -> () } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref) -> (), workgroup_attributions = 1: i64} : () -> () } func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token { // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token) // CHECK: return {{.*}} : !gpu.async.token return %arg0 : !gpu.async.token } func @async_wait() { // CHECK-LABEL: func @async_wait // CHECK: %[[t0:.*]] = gpu.wait async %0 = 
    %0 = gpu.wait async
    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
    %1 = gpu.wait async [%0]
    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
    %2 = gpu.wait async [%0, %1]
    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
    // CHECK-NOT: async
    gpu.wait [%0, %1]
    // CHECK: gpu.wait
    // CHECK-NOT: async
    gpu.wait // Valid, but a no-op.
    return
  }
}