diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1,6 +1,11 @@
 add_mlir_dialect(GPUOps gpu)
 add_mlir_doc(GPUOps -gen-op-doc GPUOps Dialects/)
 
+set(LLVM_TARGET_DEFINITIONS GPUBase.td)
+mlir_tablegen(GPUOpInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(GPUOpInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(MLIRGPUOpInterfacesIncGen)
+
 set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
 mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
 mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -53,4 +53,54 @@
   }];
 }
 
+def GPU_AsyncToken : DialectType<
+  GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token">,
+  BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
+
+def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {
+  let description = [{
+    Interface for GPU operations that execute asynchronously on the device.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [
+    InterfaceMethod<[{
+        Query the operands that represent async dependency tokens.
+      }],
+      "OperandRange", "getAsyncDependencies", (ins), [{}], [{
+        ConcreteOp op = cast(this->getOperation());
+        return op.asyncDependencies();
+      }]
+    >,
+    InterfaceMethod<[{
+        Set the operands that represent async dependency tokens.
+      }],
+      "void", "setAsyncDependencies", (ins "ValueRange":$asyncDependencies), [{}], [{
+        ConcreteOp op = cast(this->getOperation());
+        auto asyncDepOdsIndex = 0; // Async dependencies always come first.
+        auto indexAndLength = op.getODSOperandIndexAndLength(asyncDepOdsIndex);
+        op.getOperation()->setOperands(indexAndLength.first, indexAndLength.second, asyncDependencies);
+        if constexpr (op.template hasTrait()) {
+          auto attrName = ConcreteOp::getOperandSegmentSizeAttr();
+          auto sizeAttr = op.template getAttrOfType(attrName);
+          assert(sizeAttr && "operand segment size attribute not found");
+          SmallVector sizes;
+          for (auto size : sizeAttr.getIntValues())
+            sizes.push_back(size.getSExtValue());
+          sizes[asyncDepOdsIndex] = asyncDependencies.size();
+          op.setAttr(attrName, Builder(op.getContext()).getI32VectorAttr(sizes));
+        }
+      }]
+    >,
+    InterfaceMethod<[{
+        Query the result that represents the async token to depend on.
+      }],
+      "OpResult", "getAsyncToken", (ins), [{}], [{
+        ConcreteOp op = cast(this->getOperation());
+        return op.asyncToken().template cast();
+      }]
+    >,
+  ];
+}
+
 #endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
 #define MLIR_DIALECT_GPU_GPUDIALECT_H
 
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/FunctionSupport.h"
 #include "mlir/IR/OpDefinition.h"
@@ -34,13 +35,21 @@
   Value z;
 };
 
+class AsyncTokenType
+    : public Type::TypeBase {
+public:
+  // Used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
 } // end namespace gpu
 } // end namespace mlir
 
 #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.h.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.h.inc"
-
 
 #endif // MLIR_DIALECT_GPU_GPUDIALECT_H
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -249,7 +249,7 @@
       return getBody().getNumArguments() - getType().getNumInputs() -
              getNumWorkgroupAttributions();
     }
-    
+
     /// Returns a list of block arguments that correspond to buffers located in
     /// the private memory.
     ArrayRef getPrivateAttributions() {
@@ -301,7 +301,7 @@
                IntOrIndex:$blockSizeY, IntOrIndex:$blockSizeZ,
                Variadic:$operands)>,
     Results<(outs)> {
-  let summary = "Launches a function as a GPU kerneel";
+  let summary = "Launches a function as a GPU kernel";
 
   let description = [{
     Launch a kernel function on the specified grid of thread blocks.
@@ -756,4 +756,21 @@
   let verifier = [{ return success(); }];
 }
 
+def GPU_ForkOp : GPU_Op<"create_token", [GPU_AsyncOpInterface]> {
+  let summary = "Creates a GPU async token.";
+  let description = [{
+    This op creates a new async token from a list of async dependencies.
+
+    The op is inserted during lowering so that each async token is used only
+    once. This makes forks in async execution explicit.
+
+    The op does not imply any host synchronization.
+  }];
+
+  let arguments = (ins Variadic:$asyncDependencies);
+  let results = (outs GPU_AsyncToken:$asyncToken);
+
+  let assemblyFormat = "(`[` $asyncDependencies^ `]`)? attr-dict";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,7 @@
 
   DEPENDS
   MLIRGPUOpsIncGen
+  MLIRGPUOpInterfacesIncGen
   MLIRGPUPassIncGen
   MLIRParallelLoopMapperAttrGen
   MLIRParallelLoopMapperEnumsGen
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -16,12 +16,13 @@
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/FunctionImplementation.h"
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/StandardTypes.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::gpu;
@@ -36,12 +37,34 @@
 }
 
 void GPUDialect::initialize() {
+  addTypes();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
       >();
 }
 
+Type GPUDialect::parseType(DialectAsmParser &parser) const {
+  // Parse the main keyword for the type.
+  StringRef keyword;
+  if (parser.parseKeyword(&keyword))
+    return Type();
+  MLIRContext *context = getContext();
+
+  // Handle async token types; spelled 'async.token' (a keyword has no spaces).
+  if (keyword == "async.token")
+    return AsyncTokenType::get(context);
+
+  parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
+  return Type();
+}
+
+void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
+  TypeSwitch(type)
+      .Case([&](Type) { os << "async.token"; })
+      .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
   if (!attr.second.isa() ||
@@ -283,22 +306,20 @@
 // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
 // where %size-* and %iter-* will correspond to the body region arguments.
 static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
-                                ValueRange operands, KernelDim3 ids) {
+                                KernelDim3 operands, KernelDim3 ids) {
   p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
-  p << size.x << " = " << operands[0] << ", ";
-  p << size.y << " = " << operands[1] << ", ";
-  p << size.z << " = " << operands[2] << ')';
+  p << size.x << " = " << operands.x << ", ";
+  p << size.y << " = " << operands.y << ", ";
+  p << size.z << " = " << operands.z << ')';
 }
 
 static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
-  ValueRange operands = op.getOperands();
-
   // Print the launch configuration.
   p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();
-  printSizeAssignment(p, op.getGridSize(), operands.take_front(3),
+  printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(),
                       op.getBlockIds());
   p << ' ' << op.getThreadsKeyword();
-  printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),
+  printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(),
                       op.getThreadIds());
 
   p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
@@ -777,5 +798,7 @@
                 /*printBlockTerminators=*/false);
 }
 
+#include "mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/GPUOps.cpp.inc"
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -143,4 +143,15 @@
       "gpu.return"() : () -> ()
     } ) {gpu.kernel, sym_name = "kernel_1", type = (f32, memref) -> (), workgroup_attributions = 1: i64} : () -> ()
   }
+
+  func @async() {
+    // CHECK-LABEL: func @async
+    // CHECK: %[[t0:.*]] = gpu.create_token
+    %0 = gpu.create_token
+    // CHECK: %[[t1:.*]] = gpu.create_token[%[[t0]]]
+    %1 = gpu.create_token[%0]
+    // CHECK: gpu.create_token[%[[t0]], %[[t1]]]
+    %2 = gpu.create_token[%0, %1]
+    return
+  }
 }