diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,11 @@
 add_mlir_dialect(GPUOps gpu)
 add_mlir_doc(GPUOps -gen-op-doc GPUOps Dialects/)
 
+set(LLVM_TARGET_DEFINITIONS GPUBase.td)
+mlir_tablegen(GPUOpInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(GPUOpInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(MLIRGPUOpInterfacesIncGen)
+
 set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
 mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
 mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -53,4 +53,53 @@
   }];
 }
 
+def GPU_AsyncToken : DialectType<
+  GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token type">,
+  BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
+
+def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {
+  let description = [{
+    Interface for GPU operations that execute asynchronously on the device.
+
+    GPU operations implementing this interface take a list of dependencies
+    as `gpu.async.token` arguments and optionally return a `gpu.async.token`.
+
+    The op doesn't start executing until all dependent ops producing the async
+    dependency tokens have finished executing.
+
+    If the op returns a token, the op merely schedules the execution on the
+    device and returns immediately, without waiting for the execution to
+    complete. On the other hand, if the op does not return a token, the op
+    will wait for the execution to complete.
+  }];
+  let cppNamespace = "::mlir::gpu";
+
+  let methods = [
+    InterfaceMethod<[{
+        Query the operands that represent async dependency tokens.
+      }],
+      "OperandRange", "getAsyncDependencies", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncDependencies();
+      }]
+    >,
+    InterfaceMethod<[{
+        Adds a new token to the list of async dependencies.
+      }],
+      "void", "addAsyncDependency", (ins "Value":$token),
+      [{}], [{
+        gpu::addAsyncDependency(this->getOperation(), token);
+      }]
+    >,
+    InterfaceMethod<[{
+        Query the result that represents the async token to depend on.
+      }],
+      "OpResult", "getAsyncToken", (ins), [{}], [{
+        ConcreteOp op = cast<ConcreteOp>(this->getOperation());
+        return op.asyncToken().template dyn_cast_or_null<OpResult>();
+      }]
+    >
+  ];
+}
+
 #endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
 #define MLIR_DIALECT_GPU_GPUDIALECT_H
 
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/FunctionSupport.h"
 #include "mlir/IR/OpDefinition.h"
@@ -34,13 +35,24 @@
   Value z;
 };
 
+class AsyncTokenType
+    : public Type::TypeBase<AsyncTokenType, Type, TypeStorage> {
+public:
+  // Used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
+// Adds a `gpu.async.token` to the front of the argument list.
+void addAsyncDependency(Operation *op, Value token);
+
 } // end namespace gpu
 } // end namespace mlir
 
 #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.h.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.h.inc"
-
 #endif // MLIR_DIALECT_GPU_GPUDIALECT_H
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -249,7 +249,7 @@
       return getBody().getNumArguments() - getType().getNumInputs() -
              getNumWorkgroupAttributions();
     }
-    
+
     /// Returns a list of block arguments that correspond to buffers located in
     /// the private memory.
     ArrayRef<BlockArgument> getPrivateAttributions() {
@@ -301,7 +301,7 @@
                IntOrIndex:$blockSizeY, IntOrIndex:$blockSizeZ,
                Variadic<AnyType>:$operands)>,
     Results<(outs)> {
-  let summary = "Launches a function as a GPU kerneel";
+  let summary = "Launches a function as a GPU kernel";
   let description = [{
     Launch a kernel function on the specified grid of thread blocks.
@@ -756,4 +756,47 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    This op synchronizes the host or the device with a list of dependent ops.
+
+    If the op contains the `async` keyword, it returns a new async token which
+    is synchronized with the op arguments. This new token is merely a shortcut
+    to the argument list, and one could replace the uses of the result with the
+    arguments for the same effect. The async version of this op is primarily
+    used to make each async token have a single use during lowering and
+    thereby make forks in async execution explicit. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    %t2 = gpu.wait async [%t0, %t1]
+    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
+    // as if the async dependencies were [%t0, %t1].
+    %t3 = gpu.baz async [%t2]
+    ```
+
+    If the op does not contain the `async` keyword, it does not return a new
+    async token but blocks until all ops producing the async dependency tokens
+    have finished executing. All dependent memory operations are visible to
+    the host once this op completes. Example usage:
+
+    ```mlir
+    %t0 = gpu.foo async : !gpu.async.token
+    %t1 = gpu.bar async : !gpu.async.token
+    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
+    gpu.wait [%t0, %t1]
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncKeyword>(type($asyncToken)) (`[` $asyncDependencies^ `]`)?
+      attr-dict
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@
 
   DEPENDS
   MLIRGPUOpsIncGen
+  MLIRGPUOpInterfacesIncGen
   MLIRGPUPassIncGen
   MLIRParallelLoopMapperAttrGen
   MLIRParallelLoopMapperEnumsGen
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -16,12 +16,13 @@
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/FunctionImplementation.h"
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/StandardTypes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::gpu;
@@ -36,12 +37,34 @@
 }
 
 void GPUDialect::initialize() {
+  addTypes<AsyncTokenType>();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
       >();
 }
 
+Type GPUDialect::parseType(DialectAsmParser &parser) const {
+  // Parse the main keyword for the type.
+  StringRef keyword;
+  if (parser.parseKeyword(&keyword))
+    return Type();
+  MLIRContext *context = getContext();
+
+  // Handle 'async token' types.
+  if (keyword == "async.token")
+    return AsyncTokenType::get(context);
+
+  parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
+  return Type();
+}
+
+void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
+  TypeSwitch<Type>(type)
+      .Case<AsyncTokenType>([&](Type) { os << "async.token"; })
+      .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
   if (!attr.second.isa<UnitAttr>() ||
@@ -195,6 +218,26 @@
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// AsyncOpInterface
+//===----------------------------------------------------------------------===//
+
+void gpu::addAsyncDependency(Operation *op, Value token) {
+  op->insertOperands(0, {token});
+  if (!op->template hasTrait<OpTrait::AttrSizedOperandSegments>())
+    return;
+  auto attrName =
+      OpTrait::AttrSizedOperandSegments::getOperandSegmentSizeAttr();
+  auto sizeAttr = op->template getAttrOfType<DenseIntElementsAttr>(attrName);
+  if (!sizeAttr)
+    return; // Async dependencies is the only variadic operand.
+  SmallVector<int32_t, 8> sizes;
+  for (auto size : sizeAttr.getIntValues())
+    sizes.push_back(size.getSExtValue());
+  ++sizes.front();
+  op->setAttr(attrName, Builder(op->getContext()).getI32VectorAttr(sizes));
+}
+
 //===----------------------------------------------------------------------===//
 // LaunchOp
 //===----------------------------------------------------------------------===//
@@ -283,22 +326,20 @@
 // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
 // where %size-* and %iter-* will correspond to the body region arguments.
 static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
-                                ValueRange operands, KernelDim3 ids) {
+                                KernelDim3 operands, KernelDim3 ids) {
   p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
-  p << size.x << " = " << operands[0] << ", ";
-  p << size.y << " = " << operands[1] << ", ";
-  p << size.z << " = " << operands[2] << ')';
+  p << size.x << " = " << operands.x << ", ";
+  p << size.y << " = " << operands.y << ", ";
+  p << size.z << " = " << operands.z << ')';
 }
 
 static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
-  ValueRange operands = op.getOperands();
-
   // Print the launch configuration.
   p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
-  printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
+  printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(),
                       op.getBlockIds());
   p << ' ' << op.getThreadsKeyword();
-  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
+  printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(),
                       op.getThreadIds());
 
   p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
@@ -777,5 +818,23 @@
                          /*printBlockTerminators=*/false);
 }
 
+static ParseResult parseAsyncKeyword(OpAsmParser &parser,
+                                     Type &asyncTokenType) {
+  auto loc = parser.getCurrentLocation();
+  if (succeeded(parser.parseOptionalKeyword("async"))) {
+    if (parser.getNumResults() == 0)
+      return parser.emitError(loc, "needs to be named when marked 'async'");
+    asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
+  }
+  return success();
+}
+
+static void printAsyncKeyword(OpAsmPrinter &printer, Type asyncTokenType) {
+  if (asyncTokenType)
+    printer << "async ";
+}
+
+#include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -435,3 +435,17 @@
   } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 3: i64} : () -> ()
   }
 }
+
+// -----
+
+func @sync_wait_with_result() {
+  // expected-error @+1 {{cannot name an operation with no results}}
+  %t = gpu.wait
+}
+
+// -----
+
+func @async_wait_without_result() {
+  // expected-error @+1 {{custom op 'gpu.wait' needs to be named when marked 'async'}}
+  gpu.wait async
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -143,4 +143,27 @@
       "gpu.return"() : () -> ()
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref<?xf32>) -> (), workgroup_attributions = 1: i64} : () -> ()
   }
+
+  func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
+    // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
+    // CHECK: return {{.*}} : !gpu.async.token
+    return %arg0 : !gpu.async.token
+  }
+
+  func @async_wait() {
+    // CHECK-LABEL: func @async_wait
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    %0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
+    %1 = gpu.wait async [%0]
+    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
+    %2 = gpu.wait async [%0, %1]
+    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
+    // CHECK-NOT: async
+    gpu.wait [%0, %1]
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    gpu.wait // Valid, but a no-op.
+    return
+  }
 }