diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -420,7 +420,9 @@
   let builders = [
     OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
       "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
-      "ValueRange":$kernelOperands)>
+      "ValueRange":$kernelOperands,
+      CArg<"Type", "nullptr">:$asyncTokenType,
+      CArg<"ValueRange", "{}">:$asyncDependencies)>
   ];

   let extraClassDeclaration = [{
@@ -466,25 +468,32 @@
   let hasVerifier = 1;
 }

-def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>,
-    Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
+def GPU_LaunchOp : GPU_Op<"launch",
+    [AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface]>,
+    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
-    Results<(outs)> {
+    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
   let summary = "GPU kernel launch operation";

   let description = [{
     Launch a kernel on the specified grid of thread blocks. The body of the
     kernel is defined by the single region that this operation contains. The
-    operation takes six operands followed by an optional operand: the first
-    three operands are grid sizes along the x,y,z dimensions and the following
-    three are block sizes along the x,y,z dimensions. The last operand is
-    optional and corresponds to the amount of dynamic shared memory a kernel's
-    workgroup should be allocated; when this operand is not present, a zero size
-    is assumed.
-
-    When a lower-dimensional kernel is required, unused sizes must
-    be explicitly set to `1`.
+    operation takes an optional list of async dependencies followed by six
+    operands and an optional operand.
+
+    The `async` keyword indicates the kernel should be launched asynchronously;
+    the operation returns a new `!gpu.async.token` when the keyword is
+    specified. The launched kernel does not start executing until the ops
+    producing its async dependencies (optional operands) have completed.
+
+    The first three operands (following any async dependencies) are grid sizes
+    along the x,y,z dimensions and the following three are block sizes along the
+    x,y,z dimensions. When a lower-dimensional kernel is required, unused sizes
+    must be explicitly set to `1`. The last operand is optional and corresponds
+    to the amount of dynamic shared memory a kernel's workgroup should be
+    allocated; when this operand is not present, a zero size is assumed.

     The body region has _twelve_ arguments, grouped as follows:

@@ -496,7 +505,8 @@
     Syntax:

     ```
-    operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+    operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+                  `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                   `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                   (dynamic_shared_memory_size ssa-use)?
                   region attr-dict?
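Reviewer note, for illustration only (not part of the patch): the new form next to the existing synchronous one. All SSA names and the `%c1` constant are assumed.

```mlir
%c1 = arith.constant 1 : index

// Synchronous launch, unchanged by this patch.
gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
           threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
  gpu.terminator
}

// Asynchronous launch: waits on %dep and returns a !gpu.async.token.
%dep = gpu.wait async
%token = gpu.launch async [%dep]
    blocks(%bx2, %by2, %bz2) in (%gx2 = %c1, %gy2 = %c1, %gz2 = %c1)
    threads(%tx2, %ty2, %tz2) in (%sx2 = %c1, %sy2 = %c1, %sz2 = %c1) {
  gpu.terminator
}
```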
@@ -548,7 +558,9 @@
     OpBuilder<(ins "Value":$gridSizeX, "Value":$gridSizeY, "Value":$gridSizeZ,
       "Value":$blockSizeX, "Value":$blockSizeY, "Value":$blockSizeZ,
-      CArg<"Value", "nullptr">:$dynamic_shared_memory_size)>
+      CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
+      CArg<"Type", "nullptr">:$asyncTokenType,
+      CArg<"ValueRange", "{}">:$asyncDependencies)>
   ];

   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -275,6 +275,40 @@
   return walkResult.wasInterrupted() ? failure() : success();
 }

+/// Parses an optional list of async operands.
+/// (`async` `[` ssa-id-list `]`)?
+///
+/// This method is used by the tablegen assembly format for async ops as well.
+static ParseResult parseAsyncDependencies(
+    OpAsmParser &parser, Type &asyncTokenType,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return parser.parseOperandList(asyncDependencies,
+                                 OpAsmParser::Delimiter::OptionalSquare);
+}
+
+// Used by the tablegen assembly format for several async ops.
+static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
+                                   Type asyncTokenType,
+                                   OperandRange asyncDependencies) {
+  if (asyncTokenType)
+    printer << "async ";
+  if (asyncDependencies.empty())
+    return;
+  printer << '[';
+  llvm::interleaveComma(asyncDependencies, printer);
+  printer << ']';
+}
+
+//===----------------------------------------------------------------------===//
+// AllReduceOp
+//===----------------------------------------------------------------------===//
+
 LogicalResult gpu::AllReduceOp::verifyRegions() {
   if (body().empty() != op().hasValue())
     return emitError("expected either an op attribute or a non-empty body");
@@ -358,7 +392,12 @@
 void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value gridSizeX, Value gridSizeY, Value gridSizeZ,
                      Value blockSizeX, Value blockSizeY, Value blockSizeZ,
-                     Value dynamicSharedMemorySize) {
+                     Value dynamicSharedMemorySize, Type asyncTokenType,
+                     ValueRange asyncDependencies) {
+  result.addOperands(asyncDependencies);
+  if (asyncTokenType)
+    result.types.push_back(builder.getType<AsyncTokenType>());
+
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands(
       {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
@@ -373,6 +412,11 @@
   for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
     body->addArgument(builder.getIndexType(), result.location);
   kernelRegion->push_back(body);
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = asyncDependencies.size();
+  segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getI32VectorAttr(segmentSizes));
 }

 KernelDim3 LaunchOp::getBlockIds() {
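Since `parseAsyncDependencies`/`printAsyncDependencies` are shared with the declarative assembly of the other async ops, the accepted forms are easiest to see on `gpu.wait`. A sketch, with the token names assumed:

```mlir
%t0 = gpu.wait async        // `async` alone: produces a token, no dependencies.
%t1 = gpu.wait async [%t0]  // `async` plus a bracketed dependency list.
gpu.wait [%t0, %t1]         // dependencies only: no `async`, no token result.
```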
@@ -400,11 +444,13 @@
 }

 KernelDim3 LaunchOp::getGridSizeOperandValues() {
-  return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[0], operands[1], operands[2]};
 }

 KernelDim3 LaunchOp::getBlockSizeOperandValues() {
-  return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
+  auto operands = getOperands().drop_front(asyncDependencies().size());
+  return KernelDim3{operands[3], operands[4], operands[5]};
 }

 LogicalResult LaunchOp::verifyRegions() {
@@ -412,9 +458,9 @@
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
   // for block/thread identifiers and grid/block sizes.
   if (!body().empty()) {
-    if (body().getNumArguments() != LaunchOp::kNumConfigOperands +
-                                        getNumOperands() -
-                                        (dynamicSharedMemorySize() ? 1 : 0))
+    if (body().getNumArguments() !=
+        LaunchOp::kNumConfigOperands + getNumOperands() -
+            (dynamicSharedMemorySize() ? 1 : 0) - asyncDependencies().size())
       return emitOpError("unexpected number of region arguments");
   }

@@ -435,6 +481,10 @@
     }
   }

+  if (getNumResults() == 0 && asyncToken())
+    return emitOpError(
+        "needs to be named when async dependencies are specified");
+
   return success();
 }

@@ -451,6 +501,11 @@
 }

 void LaunchOp::print(OpAsmPrinter &p) {
+  if (asyncToken()) {
+    p << " async";
+    if (!asyncDependencies().empty())
+      p << " [" << asyncDependencies() << ']';
+  }
   // Print the launch configuration.
   p << ' ' << getBlocksKeyword();
   printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
@@ -464,7 +519,8 @@
   p << ' ';
   p.printRegion(body(), /*printEntryBlockArgs=*/false);
-  p.printOptionalAttrDict((*this)->getAttrs());
+  p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
+                              LaunchOp::getOperandSegmentSizeAttr()});
 }

 // Parse the size assignment blocks for blocks and threads. These have the form
@@ -498,11 +554,10 @@
 }

 /// Parses a Launch operation.
-/// operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in`
-///               ssa-reassignment
-///               `threads` `(` ssa-id-list `)` `in`
-///               ssa-reassignment
-///               region attr-dict?
+/// operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
+///               `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+///               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+///               region attr-dict?
 /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
 ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
   // Sizes of the grid and block.
@@ -518,6 +573,17 @@
                                    LaunchOp::kNumConfigRegionAttributes);
   MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);

+  // Parse optional async dependencies.
+  SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
+  Type asyncTokenType;
+  if (failed(
+          parseAsyncDependencies(parser, asyncTokenType, asyncDependencies)) ||
+      parser.resolveOperands(asyncDependencies, asyncTokenType,
+                             result.operands))
+    return failure();
+  if (parser.getNumResults() > 0)
+    result.types.push_back(asyncTokenType);
+
   // Parse the size assignment segments: the first segment assigns grid sizes
   // and defines values for block identifiers; the second segment assigns block
   // sizes and defines values for thread identifiers.  In the region argument
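With `AttrSizedOperandSegments`, the generic form of an async launch now carries an `operand_segment_sizes` attribute that the custom printer above elides. An illustrative sketch, all SSA names assumed (segment order: async dependencies, grid sizes x3, block sizes x3, dynamic shared memory size):

```mlir
%token = "gpu.launch"(%dep, %gx, %gy, %gz, %bx, %by, %bz) ({
^bb0(%bid_x: index, %bid_y: index, %bid_z: index,
     %tid_x: index, %tid_y: index, %tid_z: index,
     %grid_x: index, %grid_y: index, %grid_z: index,
     %blk_x: index, %blk_y: index, %blk_z: index):
  gpu.terminator
}) {operand_segment_sizes = dense<[1, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
  : (!gpu.async.token, index, index, index, index, index, index) -> !gpu.async.token
```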
@@ -536,13 +602,16 @@
     return failure();

   OpAsmParser::UnresolvedOperand dynamicSharedMemorySize;
+  bool hasDynamicSharedMemorySize = false;
   if (!parser.parseOptionalKeyword(
-          LaunchOp::getDynamicSharedMemorySizeKeyword()))
+          LaunchOp::getDynamicSharedMemorySizeKeyword())) {
+    hasDynamicSharedMemorySize = true;
     if (parser.parseOperand(dynamicSharedMemorySize) ||
         parser.resolveOperand(dynamicSharedMemorySize,
                               parser.getBuilder().getI32Type(),
                               result.operands))
       return failure();
+  }

   // Introduce the body region and parse it. The region has
   // kNumConfigRegionAttributes arguments that correspond to
@@ -551,8 +620,16 @@
   SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
       LaunchOp::kNumConfigRegionAttributes, index);
   Region *body = result.addRegion();
-  return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
-                 parser.parseOptionalAttrDict(result.attributes));
+  if (parser.parseRegion(*body, regionArgs, dataTypes) ||
+      parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  SmallVector<int32_t, 8> segmentSizes(8, 1);
+  segmentSizes.front() = asyncDependencies.size();
+  segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
+  result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
+                      parser.getBuilder().getI32VectorAttr(segmentSizes));
+  return success();
 }

 /// Simplify the gpu.launch when the range of a thread or block ID is
@@ -602,7 +679,12 @@
 void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
                          GPUFuncOp kernelFunc, KernelDim3 gridSize,
                          KernelDim3 blockSize, Value dynamicSharedMemorySize,
-                         ValueRange kernelOperands) {
+                         ValueRange kernelOperands, Type asyncTokenType,
+                         ValueRange asyncDependencies) {
+  result.addOperands(asyncDependencies);
+  if (asyncTokenType)
+    result.types.push_back(builder.getType<AsyncTokenType>());
+
   // Add grid and block sizes as op operands, followed by the data operands.
   result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x,
                       blockSize.y, blockSize.z});
@@ -615,7 +697,7 @@
       {SymbolRefAttr::get(kernelFunc.getNameAttr())});
   result.addAttribute(getKernelAttrName(), kernelSymbol);
   SmallVector<int32_t, 9> segmentSizes(9, 1);
-  segmentSizes.front() = 0; // Initially no async dependencies.
+  segmentSizes.front() = asyncDependencies.size();
   segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0;
   segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
   result.addAttribute(getOperandSegmentSizeAttr(),
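For context, when the extended `LaunchFuncOp::build` is given a token type and dependencies, the resulting op prints in the async `gpu.launch_func` form, roughly as below. This is a sketch; the kernel symbol, the kernel argument, and the SSA names are assumed:

```mlir
%token = gpu.launch_func async [%dep] @kernels::@my_kernel
    blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
    args(%arg0 : f32)
```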
@@ -1039,31 +1121,6 @@
   return success();
 }

-static ParseResult parseAsyncDependencies(
-    OpAsmParser &parser, Type &asyncTokenType,
-    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
-  auto loc = parser.getCurrentLocation();
-  if (succeeded(parser.parseOptionalKeyword("async"))) {
-    if (parser.getNumResults() == 0)
-      return parser.emitError(loc, "needs to be named when marked 'async'");
-    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
-  }
-  return parser.parseOperandList(asyncDependencies,
-                                 OpAsmParser::Delimiter::OptionalSquare);
-}
-
-static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
-                                   Type asyncTokenType,
-                                   OperandRange asyncDependencies) {
-  if (asyncTokenType)
-    printer << "async ";
-  if (asyncDependencies.empty())
-    return;
-  printer << "[";
-  llvm::interleaveComma(asyncDependencies, printer);
-  printer << "]";
-}
-
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//

diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -225,10 +225,13 @@
   OpBuilder builder(launchOp);
   // The launch op has an optional dynamic shared memory size. If it doesn't
   // exist, we use zero.
-  builder.create<gpu::LaunchFuncOp>(
+  Value asyncToken = launchOp.asyncToken();
+  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), launchOp.dynamicSharedMemorySize(),
-      operands);
+      operands, asyncToken ? asyncToken.getType() : nullptr,
+      launchOp.asyncDependencies());
+  launchOp.replaceAllUsesWith(launchFunc);
   launchOp.erase();
 }
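A before/after sketch of what the outlining change does to an async launch; `@fn_kernel` follows the pass's naming scheme, and the other names are assumed. The new `replaceAllUsesWith` call is what forwards uses of the launch token to the `gpu.launch_func` token:

```mlir
// Before outlining:
%t1 = gpu.launch async [%t0]
    blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
    threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
  gpu.terminator
}

// After outlining: %t1 is now produced by the launch_func.
%t1 = gpu.launch_func async [%t0] @fn_kernel::@fn_kernel
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
```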
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -4,7 +4,7 @@
   // expected-error@+1 {{expected 6 or more operands, but found 5}}
   "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
     gpu.return
-  }) : (index, index, index, index, index) -> ()
+  }) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index) -> ()
   return
 }

@@ -12,11 +12,11 @@
 func @no_region_attrs(%sz : index) {
   // expected-error@+1 {{unexpected number of region arguments}}
-  "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
+  "gpu.launch"(%sz, %sz, %sz, %sz, %sz, %sz) ({
   ^bb1(%bx: index, %by: index, %bz: index, %tx: index, %ty: index, %tz: index):
     gpu.terminator
-  }) : (index, index, index, index, index, index) -> ()
+  }) {operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>} : (index, index, index, index, index, index) -> ()
   return
 }

diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,4 +1,8 @@
 // RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s

 module attributes {gpu.container_module} {

@@ -26,6 +30,32 @@
     return
   }

+  // CHECK-LABEL:func @launch_async(%{{.*}}: index, %{{.*}}: index) {
+  func @launch_async(%blk : index, %thrd : index) {
+    // CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
+    %t = gpu.wait async
+    %name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
+      gpu.terminator
+    }
+    return
+  }
+
+  // CHECK-LABEL:func @launch_async_no_deps(%{{.*}}: index, %{{.*}}: index) {
+  func @launch_async_no_deps(%blk : index, %thrd : index) {
+    // CHECK: %{{.*}} = gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
+    %t0 = gpu.launch async blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
+      gpu.terminator
+    }
+    // CHECK: gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
+    %t1 = gpu.launch async [] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
+      gpu.terminator
+    }
+    return
+  }
+
   gpu.module @kernels {
     gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
       %tIdX = gpu.thread_id x

diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -80,6 +80,17 @@
                                          %block_z2 = %cst) {
     gpu.terminator
   }
+
+  // CHECK: %[[TOKEN:.*]] = gpu.wait async
+  // CHECK: gpu.launch_func async [%[[TOKEN]]] @multiple_launches_kernel_1::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
+  %t = gpu.wait async
+  %u = gpu.launch async [%t] blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
+                                                          %grid_z2 = %cst)
+             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
+                                           %block_z2 = %cst) {
+    gpu.terminator
+  }
+
   return
 }