diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td @@ -94,6 +94,14 @@ GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token type">, BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">; +def GPU_Stream : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::StreamType>()">, "stream type">, + BuildableType<"mlir::gpu::StreamType::get($_builder.getContext())">; + +def GPU_Device : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::DeviceType>()">, "device type">, + BuildableType<"mlir::gpu::DeviceType::get($_builder.getContext())">; + // Predicat to check if type is gpu::MMAMatrixType. def IsMMAMatrixTypePred : CPred<"$_self.isa<::mlir::gpu::MMAMatrixType>()">; diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h @@ -45,6 +45,18 @@ using Base::Base; }; +class StreamType : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; +}; + +class DeviceType : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; +}; + /// MMAMatrixType storage and uniquing. Array is uniqued based on its shape /// and type. 
struct MMAMatrixStorageType : public TypeStorage { diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -378,7 +378,8 @@ Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, Optional:$dynamicSharedMemorySize, - Variadic:$kernelOperands)>, + Variadic:$kernelOperands, + Optional:$stream)>, Results<(outs Optional:$asyncToken)> { let summary = "Launches a function as a GPU kernel"; @@ -465,12 +466,13 @@ let skipDefaultBuilders = 1; let builders = [ - OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize, - "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize, + OpBuilder<(ins "GPUFuncOp":$kernelFunc, + "KernelDim3":$gridSize, "KernelDim3":$blockSize, + "Value":$dynamicSharedMemorySize, "ValueRange":$kernelOperands, CArg<"Type", "nullptr">:$asyncTokenType, - CArg<"ValueRange", "{}">:$asyncDependencies)> - ]; + CArg<"ValueRange", "{}">:$asyncDependencies, + CArg<"Value", "Value{}">:$stream)>]; let extraClassDeclaration = [{ /// The name of the kernel's containing module. @@ -502,6 +504,7 @@ $kernel `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)` `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)` + (`stream` $stream^)? (`dynamic_shared_memory_size` $dynamicSharedMemorySize^)? custom($kernelOperands, type($kernelOperands)) attr-dict }]; @@ -929,7 +932,7 @@ let assemblyFormat = "$value attr-dict `:` type($value)"; } -def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { +def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "Wait for async gpu ops to complete."; let description = [{ This op synchronizes the host or the device with a list of dependent ops. 
@@ -963,11 +966,21 @@ ``` }]; - let arguments = (ins Variadic:$asyncDependencies); + let arguments = (ins Variadic:$asyncDependencies, + Optional:$stream); let results = (outs Optional:$asyncToken); + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<(ins + CArg<"Type", "nullptr">:$asyncTokenType, + CArg<"ValueRange", "{}">:$asyncDependencies, + CArg<"Value", "Value{}">:$stream)>]; + let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) attr-dict + (`stream` $stream^)? }]; let hasCanonicalizer = 1; @@ -1002,7 +1015,7 @@ let arguments = (ins Variadic:$asyncDependencies, Variadic:$dynamicSizes, Variadic:$symbolOperands, - UnitAttr:$hostShared); + UnitAttr:$hostShared, Optional:$stream); let results = (outs Res:$memref, Optional:$asyncToken); @@ -1011,15 +1024,17 @@ }]; let assemblyFormat = [{ - custom(type($asyncToken), $asyncDependencies) (` ` `host_shared` $hostShared^)? ` ` - `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref) + custom(type($asyncToken), $asyncDependencies) + (` ` `host_shared` $hostShared^)? ` ` + `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? + (` ` `stream` $stream^)? ` ` attr-dict `:` type($memref) }]; let hasVerifier = 1; let hasCanonicalizer = 1; } -def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> { +def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "GPU memory deallocation operation"; @@ -1043,16 +1058,18 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - Arg:$memref); + Arg:$memref, + Optional:$stream); let results = (outs Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $memref attr-dict `:` type($memref) + $memref + (`stream` $stream^)? 
attr-dict `:` type($memref) }]; } -def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> { +def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> { let summary = "GPU memcpy operation"; @@ -1075,12 +1092,14 @@ let arguments = (ins Variadic:$asyncDependencies, Arg:$dst, - Arg:$src); + Arg:$src, + Optional:$stream); let results = (outs Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $dst`,` $src `:` type($dst)`,` type($src) attr-dict + $dst`,` $src (`stream` $stream^)? `:` type($dst)`,` type($src) attr-dict + }]; let hasFolder = 1; let hasVerifier = 1; @@ -1088,7 +1107,8 @@ } def GPU_MemsetOp : GPU_Op<"memset", - [GPU_AsyncOpInterface, AllElementTypesMatch<["dst", "value"]>]> { + [GPU_AsyncOpInterface, AttrSizedOperandSegments, + AllElementTypesMatch<["dst", "value"]>]> { let summary = "GPU memset operation"; @@ -1111,12 +1131,13 @@ let arguments = (ins Variadic:$asyncDependencies, Arg:$dst, - Arg:$value); + Arg:$value, + Optional:$stream); let results = (outs Optional:$asyncToken); let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $dst`,` $value `:` type($dst)`,` type($value) attr-dict + $dst`,` $value (`stream` $stream^)? `:` type($dst)`,` type($value) attr-dict }]; let hasFolder = 1; } @@ -1388,4 +1409,38 @@ }]; } + +def GPU_CreateStreamOp : GPU_Op<"create_stream", [SameVariadicOperandSize]> { + + let description = [{ + The `gpu.create_stream` takes an optional argument `device` as input and + returns a stream, based on the device. If no device is provided, a default + device will be created by the underlying runtime. + The stream is then used for launching/queuing kernels + on the GPU. 
+ + Example: + + ```mlir + %stream = gpu.create_stream %device : !gpu.stream + + OR + + %stream = gpu.create_stream : !gpu.stream + ``` + + }]; + + let skipDefaultBuilders = 1; + + let arguments = (ins Optional : $device); + let results = (outs GPU_Stream : $stream); + let builders = [OpBuilder<(ins CArg<"Value", "Value{}">:$device)>]; + + let assemblyFormat = [{ + ($device^)? attr-dict `:` type($stream) + }]; + +} + #endif // GPU_OPS diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -142,6 +142,8 @@ void GPUDialect::initialize() { addTypes(); addTypes(); + addTypes(); + addTypes(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" @@ -164,6 +166,12 @@ if (keyword == "async.token") return AsyncTokenType::get(context); + if (keyword == "stream") + return StreamType::get(context); + + if (keyword == "device") + return DeviceType::get(context); + if (keyword == "mma_matrix") { SMLoc beginLoc = parser.getNameLoc(); @@ -203,6 +211,8 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) .Case([&](Type) { os << "async.token"; }) + .Case([&](Type) { os << "stream"; }) + .Case([&](Type) { os << "device"; }) .Case([&](MMAMatrixType fragTy) { os << "mma_matrix<"; auto shape = fragTy.getShape(); @@ -725,6 +735,19 @@ rewrites.add(context); } +//===----------------------------------------------------------------------===// +// CreateStreamOp +//===----------------------------------------------------------------------===// + +void CreateStreamOp::build(OpBuilder &odsBuilder, OperationState &result, + Value device) { + if (device) + result.addOperands(device); + + // Default builders are skipped, so the !gpu.stream result type is added here. + result.types.push_back(StreamType::get(odsBuilder.getContext())); + } + 
//===----------------------------------------------------------------------===// // LaunchFuncOp //===----------------------------------------------------------------------===// @@ -733,7 +756,7 @@ GPUFuncOp kernelFunc, KernelDim3 gridSize, KernelDim3 getBlockSize, Value dynamicSharedMemorySize, ValueRange kernelOperands, Type asyncTokenType, - ValueRange asyncDependencies) { + ValueRange asyncDependencies, Value stream) { result.addOperands(asyncDependencies); if (asyncTokenType) result.types.push_back(builder.getType()); @@ -749,10 +772,16 @@ SymbolRefAttr::get(kernelModule.getNameAttr(), {SymbolRefAttr::get(kernelFunc.getNameAttr())}); result.addAttribute(getKernelAttrName(result.name), kernelSymbol); - SmallVector segmentSizes(9, 1); + + if (stream) + result.addOperands(stream); + + SmallVector segmentSizes(10, 1); segmentSizes.front() = asyncDependencies.size(); - segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0; - segmentSizes.back() = static_cast(kernelOperands.size()); + segmentSizes[segmentSizes.size() - 3] = dynamicSharedMemorySize ? 1 : 0; + segmentSizes[segmentSizes.size() - 2] = + static_cast(kernelOperands.size()); + segmentSizes.back() = stream ? 1 : 0; result.addAttribute(getOperandSegmentSizeAttr(), builder.getDenseI32ArrayAttr(segmentSizes)); } @@ -1314,6 +1343,23 @@ // GPU_WaitOp //===----------------------------------------------------------------------===// +void WaitOp::build(OpBuilder &builder, OperationState &result, + Type asyncTokenType, ValueRange asyncDependencies, + Value stream) { + result.addOperands(asyncDependencies); + if (asyncTokenType) + result.types.push_back(builder.getType()); + + if (stream) + result.addOperands(stream); + + SmallVector segmentSizes(2, 1); + segmentSizes.front() = asyncDependencies.size(); + segmentSizes.back() = stream ? 
1 : 0; + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr(segmentSizes)); +} + namespace { /// Remove gpu.wait op use of gpu.wait op def without async dependencies. @@ -1327,17 +1373,21 @@ PatternRewriter &rewriter) const final { auto predicate = [](Value value) { auto waitOp = value.getDefiningOp(); - return waitOp && waitOp->getNumOperands() == 0; + return waitOp && waitOp.getAsyncDependencies().size() == 0; }; + if (llvm::none_of(op.getAsyncDependencies(), predicate)) return failure(); + SmallVector validOperands; - for (Value operand : op->getOperands()) { + for (Value operand : op.getAsyncDependencies()) { if (predicate(operand)) continue; validOperands.push_back(operand); } - rewriter.updateRootInPlace(op, [&]() { op->setOperands(validOperands); }); + + rewriter.updateRootInPlace( + op, [&]() { op.getAsyncDependenciesMutable().assign(validOperands); }); return success(); } }; diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -131,9 +131,16 @@ %c0 = arith.constant 0 : i32 %t0 = gpu.wait async + // CHECK: %[[stream:.*]] = gpu.create_stream : !gpu.stream + %stream = gpu.create_stream : !gpu.stream + // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref) gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref) + gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) stream %stream args(%0 : f32, %1 : memref) + + gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) stream %stream dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref) + gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : 
memref) // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) @@ -218,6 +225,7 @@ func.func @alloc() { // CHECK-LABEL: func @alloc() + // CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1> %m0 = gpu.alloc () : memref<13xf32, 1> // CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1> @@ -234,6 +242,13 @@ // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1> gpu.dealloc %m2 : memref<13xf32, 1> + // CHECK: %[[stream:.*]] = gpu.create_stream : !gpu.stream + %stream = gpu.create_stream : !gpu.stream + // CHECK: %[[m3:.*]] = gpu.alloc () stream %[[stream]] : memref<13xf32, 1> + %m3 = gpu.alloc () stream %stream : memref<13xf32, 1> + // CHECK: gpu.dealloc %[[m3]] stream %[[stream]] : memref<13xf32, 1> + gpu.dealloc %m3 stream %stream : memref<13xf32, 1> + return } @@ -268,6 +283,16 @@ %0 = gpu.wait async // CHECK: {{.*}} = gpu.memcpy async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1> %1 = gpu.memcpy async [%0] %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1> + + // CHECK: %[[stream:.*]] = gpu.create_stream : !gpu.stream + %stream = gpu.create_stream : !gpu.stream + // CHECK: gpu.memcpy {{.*}}, {{.*}} stream %[[stream]] : memref<3x7xf32>, memref<3x7xf32, 1> + gpu.memcpy %dst, %src stream %stream : memref<3x7xf32>, memref<3x7xf32, 1> + // CHECK: %[[t1:.*]] = gpu.wait async stream %[[stream]] + %2 = gpu.wait async stream %stream + // CHECK: {{.*}} = gpu.memcpy async [%[[t1]]] {{.*}}, {{.*}} stream %[[stream]] : memref<3x7xf32>, memref<3x7xf32, 1> + %3 = gpu.memcpy async [%2] %dst, %src stream %stream : memref<3x7xf32>, memref<3x7xf32, 1> + return } @@ -279,6 +304,16 @@ %0 = gpu.wait async // CHECK: {{.*}} = gpu.memset async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, f32 %1 = gpu.memset async [%0] %dst, %value : memref<3x7xf32>, f32 + + // CHECK: %[[stream:.*]] = gpu.create_stream : !gpu.stream + %stream = gpu.create_stream : !gpu.stream + // CHECK: gpu.memset {{.*}}, {{.*}} stream %[[stream]] : 
memref<3x7xf32>, f32 + gpu.memset %dst, %value stream %stream : memref<3x7xf32>, f32 + // CHECK: %[[t1:.*]] = gpu.wait async stream %[[stream]] + %2 = gpu.wait async stream %stream + // CHECK: {{.*}} = gpu.memset async [%[[t1]]] {{.*}}, {{.*}} stream %[[stream]] : memref<3x7xf32>, f32 + %3 = gpu.memset async [%2] %dst, %value stream %stream : memref<3x7xf32>, f32 + return }