diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -407,7 +407,8 @@
   let builders = [
     OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
       "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
-      "ValueRange":$kernelOperands)>
+      "ValueRange":$kernelOperands,
+      CArg<"ValueRange", "{}">:$asyncDependencies)>
   ];
 
   let extraClassDeclaration = [{
@@ -453,25 +454,32 @@
   let hasVerifier = 1;
 }
 
-def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>,
-    Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+def GPU_LaunchOp : GPU_Op<"launch",
+      [AutomaticAllocationScope, AttrSizedOperandSegments]>,
+    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
-    Results<(outs)> {
+    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";
 
   let description = [{
     Launch a kernel on the specified grid of thread blocks. The body of the
     kernel is defined by the single region that this operation contains. The
-    operation takes six operands followed by an optional operand: the first
-    three operands are grid sizes along the x,y,z dimensions and the following
-    three are block sizes along the x,y,z dimensions. The last operand is
-    optional and corresponds to the amount of dynamic shared memory a kernel's
-    workgroup should be allocated; when this operand is not present, a zero size
-    is assumed.
-
-    When a lower-dimensional kernel is required, unused sizes must
-    be explicitly set to `1`.
+    operation takes an optional list of async dependencies followed by six
+    operands and an optional operand.
+
+    The kernel launched does not start executing until the ops producing its
+    async dependencies have completed. The operation returns a new
+    !gpu.async.token if there is at least one asynchronous dependency specified
+    among its async operands.
+
+    The first three operands (following any async dependencies) are grid sizes
+    along the x,y,z dimensions and the following three are block sizes along the
+    x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes
+    must be explicitly set to `1`. The last operand is optional and corresponds
+    to the amount of dynamic shared memory a kernel's workgroup should be
+    allocated; when this operand is not present, a zero size is assumed.
 
     The body region has _twelve_ arguments, grouped as follows:
 
@@ -483,7 +491,8 @@
     Syntax:
 
     ```
-    operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
+    operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+                  `block` `(` ssa-id-list `)` `in` ssa-reassignment
                   `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                   (dynamic_shared_memory_size ssa-use)?
                   region attr-dict?
@@ -535,7 +544,8 @@
     OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY,
       "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY,
       "Value":$blockSizeZ,
-      CArg<"Value", "nullptr">:$dynamic_shared_memory_size)>
+      CArg<"Value", "nullptr">:$dynamic_shared_memory_size,
+      CArg<"ValueRange", "{}">:$asyncDependencies)>
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -358,7 +358,12 @@
 void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value gridSizeX, Value gridSizeY, Value gridSizeZ,
                      Value blockSizeX, Value blockSizeY, Value blockSizeZ,
-                     Value dynamicSharedMemorySize) {
+                     Value dynamicSharedMemorySize,
+                     ValueRange asyncDependencies) {
+  result.addOperands(asyncDependencies);
+  if (!asyncDependencies.empty())
+    result.types.push_back(builder.getType<AsyncTokenType>());
+
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands(
       {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
@@ -373,6 +378,11 @@
   for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
     body->addArgument(builder.getIndexType(), result.location);
   kernelRegion->push_back(body);
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = asyncDependencies.size();
+  segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getI32VectorAttr(segmentSizes));
 }
 
 KernelDim3 LaunchOp::getBlockIds() {
@@ -400,11 +410,13 @@
 }
 
 KernelDim3 LaunchOp::getGridSizeOperandValues() {
-  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[0], operands[1], operands[2]};
 }
 
 KernelDim3 LaunchOp::getBlockSizeOperandValues() {
-  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[3], operands[4], operands[5]};
 }
 
 LogicalResult LaunchOp::verifyRegions() {
@@ -412,9 +424,9 @@
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
   // for block/thread identifiers and grid/block sizes.
   if (!body().empty()) {
-    if (body().getNumArguments() != LaunchOp::kNumConfigOperands +
-                                        getNumOperands() -
-                                        (dynamicSharedMemorySize() ? 1 : 0))
+    if (body().getNumArguments() !=
+        LaunchOp::kNumConfigOperands + getNumOperands() -
+            (dynamicSharedMemorySize() ? 1 : 0) - asyncDependencies().size())
       return emitOpError("unexpected number of region arguments");
   }
 
@@ -435,6 +447,10 @@
     }
   }
 
+  if (!asyncDependencies().empty() && !asyncToken())
+    return emitOpError(
+        "needs to be named when async dependencies are specified");
+
   return success();
 }
 
@@ -451,6 +467,8 @@
 }
 
 void LaunchOp::print(OpAsmPrinter &p) {
+  if (!asyncDependencies().empty())
+    p << " async [" << asyncDependencies() << "]";
   // Print the launch configuration.
   p << ' ' << getBlocksKeyword();
   printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
@@ -464,7 +482,8 @@
   p << ' ';
   p.printRegion(body(), /*printEntryBlockArgs=*/false);
 
-  p.printOptionalAttrDict((*this)->getAttrs());
+  p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
+                              LaunchOp::getOperandSegmentSizeAttr()});
 }
 
 // Parse the size assignment blocks for blocks and threads. These have the form
@@ -497,12 +516,24 @@
   return parser.parseRParen();
 }
 
+static ParseResult parseAsyncDependencies(
+    OpAsmParser &parser, Type &asyncTokenType,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return parser.parseOperandList(asyncDependencies,
+                                 OpAsmParser::Delimiter::OptionalSquare);
+}
+
 /// Parses a Launch operation.
-/// operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in`
-///               ssa-reassignment
-///               `threads` `(` ssa-id-list `)` `in`
-///               ssa-reassignment
-///               region attr-dict?
+/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
+///               `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+///               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+///               region attr-dict?
 /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
 ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Sizes of the grid and block.
@@ -518,6 +549,17 @@
       LaunchOp::kNumConfigRegionAttributes);
   MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
 
+  // Parse optional async dependencies.
+  SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
+  Type asyncTokenType;
+  if (failed(
+          parseAsyncDependencies(parser, asyncTokenType, asyncDependencies)) ||
+      parser.resolveOperands(asyncDependencies, asyncTokenType,
+                             result.operands))
+    return failure();
+  if (!asyncDependencies.empty())
+    result.types.push_back(asyncTokenType);
+
   // Parse the size assignment segments: the first segment assigns grid sizes
   // and defines values for block identifiers; the second segment assigns block
   // sizes and defines values for thread identifiers. In the region argument
@@ -536,13 +578,16 @@
     return failure();
 
   OpAsmParser::UnresolvedOperand dynamicSharedMemorySize;
+  bool hasDynamicSharedMemorySize = false;
   if (!parser.parseOptionalKeyword(
-          LaunchOp::getDynamicSharedMemorySizeKeyword()))
+          LaunchOp::getDynamicSharedMemorySizeKeyword())) {
+    hasDynamicSharedMemorySize = true;
     if (parser.parseOperand(dynamicSharedMemorySize) ||
         parser.resolveOperand(dynamicSharedMemorySize,
                               parser.getBuilder().getI32Type(),
                               result.operands))
       return failure();
+  }
 
   // Introduce the body region and parse it. The region has
   // kNumConfigRegionAttributes arguments that correspond to
@@ -551,8 +596,16 @@
   SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
       LaunchOp::kNumConfigRegionAttributes, index);
   Region *body = result.addRegion();
-  return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
-                 parser.parseOptionalAttrDict(result.attributes));
+  if (parser.parseRegion(*body, regionArgs, dataTypes) ||
+      parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = asyncDependencies.size();
+  segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
+  result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
+                      parser.getBuilder().getI32VectorAttr(segmentSizes));
+  return success();
 }
 
 /// Simplify the gpu.launch when the range of a thread or block ID is
@@ -602,7 +655,12 @@
 void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
                          GPUFuncOp kernelFunc, KernelDim3 gridSize,
                          KernelDim3 blockSize, Value dynamicSharedMemorySize,
-                         ValueRange kernelOperands) {
+                         ValueRange kernelOperands,
+                         ValueRange asyncDependencies) {
+  result.addOperands(asyncDependencies);
+  if (!asyncDependencies.empty())
+    result.types.push_back(builder.getType<AsyncTokenType>());
+
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x,
                       blockSize.y, blockSize.z});
@@ -615,7 +673,7 @@
       {SymbolRefAttr::get(kernelFunc.getNameAttr())});
   result.addAttribute(getKernelAttrName(), kernelSymbol);
   SmallVector<int32_t, 9> segmentSizes(9, 1);
-  segmentSizes.front() = 0; // Initially no async dependencies.
+  segmentSizes.front() = asyncDependencies.size();
   segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0;
   segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
   result.addAttribute(getOperandSegmentSizeAttr(),
@@ -1039,19 +1097,6 @@
   return success();
 }
 
-static ParseResult parseAsyncDependencies(
-    OpAsmParser &parser, Type &asyncTokenType,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
-  auto loc = parser.getCurrentLocation();
-  if (succeeded(parser.parseOptionalKeyword("async"))) {
-    if (parser.getNumResults() == 0)
-      return parser.emitError(loc, "needs to be named when marked 'async'");
-    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
-  }
-  return parser.parseOperandList(asyncDependencies,
-                                 OpAsmParser::Delimiter::OptionalSquare);
-}
-
 static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
                                    Type asyncTokenType,
                                    OperandRange asyncDependencies) {
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -225,10 +225,11 @@
   OpBuilder builder(launchOp);
   // The launch op has an optional dynamic shared memory size. If it doesn't
   // exist, we use zero.
-  builder.create<gpu::LaunchFuncOp>(
+  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), launchOp.dynamicSharedMemorySize(),
-      operands);
+      operands, launchOp.asyncDependencies());
+  launchOp.replaceAllUsesWith(launchFunc);
   launchOp.erase();
 }
 
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -4,7 +4,7 @@
   // expected-error@+1 {{expected 6 or more operands, but found 5}}
   "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
     gpu.return
-  }) : (index, index, index, index, index) -> ()
+  }) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index) -> ()
   return
 }
 
@@ -12,11 +12,11 @@
 
 func @no_region_attrs(%sz : index) {
   // expected-error@+1 {{unexpected number of region arguments}}
-  "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
+  "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
   ^bb1(%bx: index, %by: index, %bz: index,
        %tx: index, %ty: index, %tz: index):
     gpu.terminator
-  }) : (index, index, index, index, index, index) -> ()
+  }) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index, index) -> ()
   return
 }
 
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,4 +1,8 @@
 // RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
 
 module attributes {gpu.container_module} {
 
@@ -26,6 +30,17 @@
     return
   }
 
+  // CHECK-LABEL:func @launch_async(%{{.*}}: index, %{{.*}}: index) {
+  func @launch_async(%blk : index, %thrd : index) {
+    // CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
+    %t = gpu.wait async
+    %name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+              threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
+      gpu.terminator
+    }
+    return
+  }
+
   gpu.module @kernels {
     gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
       %tIdX = gpu.thread_id x
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -80,6 +80,17 @@
                                          %block_z2 = %cst) {
     gpu.terminator
   }
+
+  // CHECK: %[[TOKEN:.*]] = gpu.wait async
+  // CHECK: gpu.launch_func async [%[[TOKEN]]] @multiple_launches_kernel_1::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
+  %t = gpu.wait async
+  %u = gpu.launch async [%t] blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
+                                                          %grid_z2 = %cst)
+             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
+                                           %block_z2 = %cst) {
+    gpu.terminator
+  }
+
   return
 }