diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@@ -36,6 +36,43 @@
 complex lifetime analysis following the principles of MLIR that promote
 structure and representing analysis results in the IR.
 
+## Queue management
+
+Some GPU dialect ops support an optional `queue` argument. Queues can be
+mapped to OpenCL/SYCL queues and allow more fine-grained control over kernel
+scheduling, for example to:
+* Interleave execution on multiple devices.
+* Use separate queues for kernel execution and data copying.
+
+Example:
+```mlir
+%queue1 = gpu.create_queue "device1"
+%queue2 = gpu.create_queue "device2"
+
+// Execute kernels on different devices.
+gpu.launch_func <%queue1> @kernels::@kernel_1 ...
+gpu.launch_func <%queue2> @kernels::@kernel_2 ...
+```
+
+Queues do not introduce any additional synchronization semantics: ops without
+async tokens are executed immediately, and ops with async tokens are executed
+according to their token dependencies.
+
+Asynchronous execution example:
+```mlir
+%queue1 = gpu.create_queue "device1"
+%queue2 = gpu.create_queue "device2"
+
+%token1 = gpu.wait async <%queue1>
+%token2 = gpu.wait async <%queue2>
+
+// Execute kernels in parallel.
+%token3 = gpu.launch_func async [%token1] <%queue1> @kernels::@kernel_1 ...
+%token4 = gpu.launch_func async [%token2] <%queue2> @kernels::@kernel_2 ...
+
+gpu.wait [%token3] <%queue1>
+gpu.wait [%token4] <%queue2>
+```
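+
+For illustration, a common pattern is to dedicate one queue to data transfers
+and another to kernel execution, joining the two timelines through async
+tokens. The buffers `%dst`/`%src` and the kernel `@kernels::@process` below
+are placeholders, not ops defined elsewhere in this document:
+```mlir
+%copy_queue = gpu.create_queue "device1"
+%exec_queue = gpu.create_queue "device1"
+
+%t0 = gpu.wait async <%copy_queue>
+%t1 = gpu.wait async <%exec_queue>
+
+// The copy and the kernel may run concurrently on their respective queues.
+%t2 = gpu.memcpy async [%t0] <%copy_queue> %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
+%t3 = gpu.launch_func async [%t1] <%exec_queue> @kernels::@process ...
+
+// Join both timelines on the host.
+gpu.wait [%t2, %t3]
+```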
+
 ## Operations
 
 [include "Dialects/GPUOps.md"]
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -95,6 +95,10 @@
     GPU_Dialect, CPred<"::llvm::isa<::mlir::gpu::AsyncTokenType>($_self)">, "async token type">,
     BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
 
+def GPU_Queue : DialectType<
+    GPU_Dialect, CPred<"::llvm::isa<::mlir::gpu::QueueType>($_self)">, "queue type">,
+    BuildableType<"mlir::gpu::QueueType::get($_builder.getContext())">;
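+
+// A queue is passed to ops as a regular operand, declared in ODS as
+// `Optional<GPU_Queue>:$queue`.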
+
 // Predicate to check if type is gpu::MMAMatrixType.
 def IsMMAMatrixTypePred : CPred<"::llvm::isa<::mlir::gpu::MMAMatrixType>($_self)">;
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -46,6 +46,12 @@
   using Base::Base;
 };
 
+class QueueType : public Type::TypeBase<QueueType, Type, TypeStorage> {
+public:
+  // Used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
 /// MMAMatrixType storage and uniquing. Array is uniqued based on its shape
 /// and type.
 struct MMAMatrixStorageType : public TypeStorage {
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -436,7 +436,8 @@
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
                Optional<I32>:$dynamicSharedMemorySize,
-               Variadic<AnyType>:$kernelOperands)>,
+               Variadic<AnyType>:$kernelOperands,
+               Optional<GPU_Queue>:$queue)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
 
   let summary = "Launches a function as a GPU kernel";
 
@@ -509,6 +510,7 @@
     gpu.launch_func
         async                           // (Optional) Don't block host, return token.
        [%t0]                           // (Optional) Execute only after %t0 has completed.
+        <%queue>                        // (Optional) Execution queue.
        @kernels::@kernel_1             // Kernel function.
        blocks in (%cst, %cst, %cst)    // Grid size.
        threads in (%cst, %cst, %cst)   // Block size.
 
@@ -523,12 +525,13 @@
   let skipDefaultBuilders = 1;
 
   let builders = [
-    OpBuilder<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
-      "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
+    OpBuilder<(ins "GPUFuncOp":$kernelFunc,
+      "KernelDim3":$gridSize, "KernelDim3":$blockSize,
+      "Value":$dynamicSharedMemorySize,
       "ValueRange":$kernelOperands,
       CArg<"Type", "nullptr">:$asyncTokenType,
-      CArg<"ValueRange", "{}">:$asyncDependencies)>
-  ];
+      CArg<"ValueRange", "{}">:$asyncDependencies,
+      CArg<"Value", "Value{}">:$queue)>];
 
   let extraClassDeclaration = [{
     /// The name of the kernel's containing module.
@@ -557,6 +560,7 @@
   let assemblyFormat = [{
       custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+      (` ` `<` $queue^ `>`)?
      $kernel
      `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
      `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
@@ -1077,7 +1081,7 @@
   let assemblyFormat = "$value attr-dict `:` type($value)";
 }
 
-def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
+def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
 
   let summary = "Wait for async gpu ops to complete.";
 
   let description = [{
     This op synchronizes the host or the device with a list of dependent ops.
 
@@ -1111,11 +1115,21 @@
     ```
   }];
 
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                   Optional<GPU_Queue>:$queue);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins
+      CArg<"Type", "nullptr">:$asyncTokenType,
+      CArg<"ValueRange", "{}">:$asyncDependencies,
+      CArg<"Value", "Value{}">:$queue)>];
+
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    (` ` `<` $queue^ `>`)? attr-dict
   }];
 
   let hasCanonicalizer = 1;
@@ -1150,7 +1164,7 @@
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands,
-                   UnitAttr:$hostShared);
+                   UnitAttr:$hostShared, Optional<GPU_Queue>:$queue);
   let results = (outs Res<AnyMemRef, "", [MemAlloc<DefaultResource>]>:$memref,
                  Optional<GPU_AsyncToken>:$asyncToken);
 
@@ -1158,16 +1172,31 @@
     MemRefType getType() { return ::llvm::cast<MemRefType>(getMemref().getType()); }
   }];
 
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins
+      "Type":$memref,
+      "Type":$asyncTokenType,
+      "ValueRange":$asyncDependencies,
+      "ValueRange":$dynamicSizes,
+      "ValueRange":$symbolOperands,
+      CArg<"bool", "false">:$hostShared,
+      CArg<"Value", "Value{}">:$queue)>];
+
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) (` ` `host_shared` $hostShared^)? ` `
-    `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    (` ` `<` $queue^ `>`)?
+    (` ` `host_shared` $hostShared^)? ` `
+    `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)?
+    ` ` attr-dict `:` type($memref)
   }];
 
   let hasVerifier = 1;
   let hasCanonicalizer = 1;
 }
 
-def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> {
+def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
 
   let summary = "GPU memory deallocation operation";
 
@@ -1191,16 +1220,27 @@
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Arg<AnyMemRef, "", [MemFree]>:$memref);
+                   Arg<AnyMemRef, "", [MemFree]>:$memref,
+                   Optional<GPU_Queue>:$queue);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins
+      "Type":$asyncTokenType,
+      "ValueRange":$asyncDependencies,
+      "Value":$memref,
+      CArg<"Value", "Value{}">:$queue)>];
+
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $memref attr-dict `:` type($memref)
+    (` ` `<` $queue^ `>`)? $memref attr-dict `:` type($memref)
   }];
 }
 
-def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {
+def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
 
   let summary = "GPU memcpy operation";
 
@@ -1223,12 +1263,24 @@
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    Arg<AnyMemRef, "", [MemWrite]>:$dst,
-                   Arg<AnyMemRef, "", [MemRead]>:$src);
+                   Arg<AnyMemRef, "", [MemRead]>:$src,
+                   Optional<GPU_Queue>:$queue);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins
+      "Type":$asyncTokenType,
+      "ValueRange":$asyncDependencies,
+      "Value":$dst,
+      "Value":$src,
+      CArg<"Value", "Value{}">:$queue)>];
+
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dst`,` $src `:` type($dst)`,` type($src) attr-dict
+    (` ` `<` $queue^ `>`)? $dst`,` $src `:` type($dst)`,` type($src) attr-dict
   }];
 
   let hasFolder = 1;
   let hasVerifier = 1;
@@ -1236,7 +1288,8 @@
 }
 
 def GPU_MemsetOp : GPU_Op<"memset",
-  [GPU_AsyncOpInterface, AllElementTypesMatch<["dst", "value"]>]> {
+  [GPU_AsyncOpInterface, AttrSizedOperandSegments,
+   AllElementTypesMatch<["dst", "value"]>]> {
 
   let summary = "GPU memset operation";
 
@@ -1259,12 +1312,13 @@
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    Arg<AnyMemRef, "", [MemWrite]>:$dst,
-                   Arg<AnyType, "">:$value);
+                   Arg<AnyType, "">:$value,
+                   Optional<GPU_Queue>:$queue);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dst`,` $value `:` type($dst)`,` type($value) attr-dict
+    (` ` `<` $queue^ `>`)? $dst`,` $value `:` type($dst)`,` type($value) attr-dict
   }];
 
   let hasFolder = 1;
 }
 
@@ -1738,7 +1792,7 @@
                        Index:$rows,
                        Index:$cols,
                        AnyMemRef:$memref);
-  let results = (outs Res<GPU_SparseSpMatHandle>:$spMat, 
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
                  Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
@@ -2111,4 +2165,49 @@
   }];
 }
 
+def GPU_CreateQueueOp : GPU_Op<"create_queue"> {
+  let summary = "Creates a GPU queue.";
+
+  let description = [{
+    The `gpu.create_queue` op takes an optional `desc` attribute and returns a
+    new queue. The meaning of `desc` is implementation-specific; for example,
+    it can be a device name. The queue can then be used for launching and
+    scheduling kernels on the GPU.
+
+    Example:
+
+    ```mlir
+    %queue = gpu.create_queue "device1"
+
+    // Or, without a descriptor:
+    %queue = gpu.create_queue
+    ```
+  }];
+
+  let skipDefaultBuilders = 1;
+
+  let arguments = (ins OptionalAttr<AnyAttr>:$desc);
+  let results = (outs GPU_Queue:$queue);
+  let builders = [OpBuilder<(ins CArg<"Attribute", "{}">:$desc)>];
+
+  let assemblyFormat = "($desc^)? attr-dict";
+}
+
+def GPU_DestroyQueueOp : GPU_Op<"destroy_queue"> {
+  let summary = "Destroys a GPU queue.";
+
+  let description = [{
+    The `gpu.destroy_queue` op destroys the given GPU queue.
+
+    Example:
+
+    ```mlir
+    gpu.destroy_queue %queue
+    ```
+  }];
+
+  let arguments = (ins GPU_Queue:$queue);
+
+  let assemblyFormat = "$queue attr-dict";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -867,6 +867,10 @@
     return rewriter.notifyMatchFailure(
         allocOp, "host_shared allocation is not supported");
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(allocOp,
+                                       "queue argument is not supported yet");
+
   MemRefType memRefType = allocOp.getType();
 
   if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
@@ -913,6 +917,10 @@
       failed(isAsyncWithOneDependency(rewriter, deallocOp)))
     return failure();
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(deallocOp,
+                                       "queue argument is not supported yet");
+
   Location loc = deallocOp.getLoc();
 
   Value pointer =
@@ -979,6 +987,10 @@
   if (waitOp.getAsyncToken())
     return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(waitOp,
+                                       "queue argument is not supported yet");
+
   Location loc = waitOp.getLoc();
 
   for (auto operand : adaptor.getOperands()) {
@@ -1154,6 +1166,10 @@
   if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
     return failure();
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(launchOp,
+                                       "queue argument is not supported yet");
+
   if (launchOp.getAsyncDependencies().size() > 1)
     return rewriter.notifyMatchFailure(
         launchOp, "Cannot convert with more than one async dependency.");
@@ -1258,6 +1274,10 @@
       failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
     return failure();
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(memcpyOp,
+                                       "queue argument is not supported yet");
+
   auto loc = memcpyOp.getLoc();
 
   MemRefDescriptor srcDesc(adaptor.getSrc());
@@ -1298,6 +1318,10 @@
       failed(isAsyncWithOneDependency(rewriter, memsetOp)))
     return failure();
 
+  if (adaptor.getQueue())
+    return rewriter.notifyMatchFailure(memsetOp,
+                                       "queue argument is not supported yet");
+
   auto loc = memsetOp.getLoc();
 
   Type valueType = adaptor.getValue().getType();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -148,6 +148,7 @@
   addTypes<AsyncTokenType>();
   addTypes<MMAMatrixType>();
   addTypes<SparseSpMatHandleType>();
+  addTypes<QueueType>();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
@@ -181,6 +182,9 @@
   if (keyword == "async.token")
     return AsyncTokenType::get(context);
 
+  if (keyword == "queue")
+    return QueueType::get(context);
+
   if (keyword == "mma_matrix") {
     SMLoc beginLoc = parser.getNameLoc();
 
@@ -231,6 +235,7 @@
       })
      .Case<SparseSpMatHandleType>(
          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
+      .Case<QueueType>([&](Type) { os << "queue"; })
      .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
        os << "mma_matrix<";
        auto shape = fragTy.getShape();
@@ -906,6 +911,18 @@
   return getBody().addArgument(type, loc);
 }
 
+//===----------------------------------------------------------------------===//
+// CreateQueueOp
+//===----------------------------------------------------------------------===//
+
+void CreateQueueOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                          /*optional*/ Attribute desc) {
+  if (desc)
+    odsState.getOrAddProperties<Properties>().desc = desc;
+
+  odsState.addTypes(QueueType::get(odsBuilder.getContext()));
+}
+
 //===----------------------------------------------------------------------===//
 // LaunchFuncOp
 //===----------------------------------------------------------------------===//
@@ -914,7 +931,7 @@
                          GPUFuncOp kernelFunc, KernelDim3 gridSize,
                          KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
                          ValueRange kernelOperands, Type asyncTokenType,
-                         ValueRange asyncDependencies) {
+                         ValueRange asyncDependencies, Value queue) {
   result.addOperands(asyncDependencies);
   if (asyncTokenType)
     result.types.push_back(builder.getType<AsyncTokenType>());
@@ -930,10 +947,16 @@
       SymbolRefAttr::get(kernelModule.getNameAttr(),
                         {SymbolRefAttr::get(kernelFunc.getNameAttr())});
   result.addAttribute(getKernelAttrName(result.name), kernelSymbol);
-  SmallVector<int32_t, 9> segmentSizes(9, 1);
+
+  if (queue)
+    result.addOperands(queue);
+
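+  // Operand segments, in declaration order: asyncDependencies, the six
+  // grid/block size operands, dynamicSharedMemorySize, kernelOperands, and
+  // the optional queue.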
+  SmallVector<int32_t, 10> segmentSizes(10, 1);
   segmentSizes.front() = asyncDependencies.size();
-  segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0;
-  segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
+  segmentSizes[segmentSizes.size() - 3] = dynamicSharedMemorySize ? 1 : 0;
+  segmentSizes[segmentSizes.size() - 2] =
+      static_cast<int32_t>(kernelOperands.size());
+  segmentSizes.back() = queue ? 1 : 0;
   result.addAttribute(getOperandSegmentSizeAttr(),
                       builder.getDenseI32ArrayAttr(segmentSizes));
 }
@@ -1542,6 +1565,22 @@
   results.add<EraseTrivialCopyOp>(context);
 }
 
+void MemcpyOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                     /*optional*/ Type asyncToken, ValueRange asyncDependencies,
+                     Value dst, Value src, /*optional*/ Value queue) {
+  odsState.addOperands(asyncDependencies);
+  odsState.addOperands(dst);
+  odsState.addOperands(src);
+  if (queue)
+    odsState.addOperands(queue);
+  odsState.getOrAddProperties<Properties>().operand_segment_sizes =
+      odsBuilder.getDenseI32ArrayAttr(
+          {static_cast<int32_t>(asyncDependencies.size()), 1, 1,
+           (queue ? 1 : 0)});
+  if (asyncToken)
+    odsState.addTypes(asyncToken);
+}
+
 //===----------------------------------------------------------------------===//
 // GPU_SubgroupMmaLoadMatrixOp
 //===----------------------------------------------------------------------===//
@@ -1638,6 +1677,23 @@
 // GPU_WaitOp
 //===----------------------------------------------------------------------===//
 
+void WaitOp::build(OpBuilder &builder, OperationState &result,
+                   Type asyncTokenType, ValueRange asyncDependencies,
+                   Value queue) {
+  result.addOperands(asyncDependencies);
+  if (asyncTokenType)
+    result.types.push_back(builder.getType<AsyncTokenType>());
+
+  if (queue)
+    result.addOperands(queue);
+
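+  // Two operand segments: the async dependencies and the optional queue.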
+  SmallVector<int32_t, 2> segmentSizes(2, 1);
+  segmentSizes.front() = asyncDependencies.size();
+  segmentSizes.back() = queue ? 1 : 0;
+  result.addAttribute(getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr(segmentSizes));
+}
+
 namespace {
 
 /// Remove gpu.wait op use of gpu.wait op def without async dependencies.
@@ -1651,17 +1707,21 @@
                                 PatternRewriter &rewriter) const final {
     auto predicate = [](Value value) {
       auto waitOp = value.getDefiningOp<WaitOp>();
-      return waitOp && waitOp->getNumOperands() == 0;
+      return waitOp && waitOp.getAsyncDependencies().empty();
     };
+
     if (llvm::none_of(op.getAsyncDependencies(), predicate))
       return failure();
+
     SmallVector<Value> validOperands;
-    for (Value operand : op->getOperands()) {
+    for (Value operand : op.getAsyncDependencies()) {
       if (predicate(operand))
         continue;
      validOperands.push_back(operand);
     }
-    rewriter.updateRootInPlace(op, [&]() { op->setOperands(validOperands); });
+
+    rewriter.updateRootInPlace(
+        op, [&]() { op.getAsyncDependenciesMutable().assign(validOperands); });
     return success();
   }
 };
@@ -1765,6 +1825,51 @@
   results.add<SimplifyDimOfAllocOp>(context);
 }
 
+void AllocOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                    Type memref,
+                    /*optional*/ Type asyncToken, ValueRange asyncDependencies,
+                    ValueRange dynamicSizes, ValueRange symbolOperands,
+                    /*optional*/ bool hostShared,
+                    /*optional*/ Value queue) {
+  odsState.addOperands(asyncDependencies);
+  odsState.addOperands(dynamicSizes);
+  odsState.addOperands(symbolOperands);
+  if (queue)
+    odsState.addOperands(queue);
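+  // Operand segments: asyncDependencies, dynamicSizes, symbolOperands, and
+  // the optional queue.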
+  odsState.getOrAddProperties<Properties>().operand_segment_sizes =
+      odsBuilder.getDenseI32ArrayAttr(
+          {static_cast<int32_t>(asyncDependencies.size()),
+           static_cast<int32_t>(dynamicSizes.size()),
+           static_cast<int32_t>(symbolOperands.size()), (queue ? 1 : 0)});
+  if (hostShared)
+    odsState.getOrAddProperties<Properties>().hostShared =
+        odsBuilder.getUnitAttr();
+  odsState.addTypes(memref);
+  if (asyncToken)
+    odsState.addTypes(asyncToken);
+}
+
+//===----------------------------------------------------------------------===//
+// GPU_DeallocOp
+//===----------------------------------------------------------------------===//
+
+void DeallocOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                      /*optional*/ Type asyncToken,
+                      ValueRange asyncDependencies, Value memref,
+                      /*optional*/ Value queue) {
+  odsState.addOperands(asyncDependencies);
+  odsState.addOperands(memref);
+  if (queue)
+    odsState.addOperands(queue);
+  odsState.getOrAddProperties<Properties>().operand_segment_sizes =
+      odsBuilder.getDenseI32ArrayAttr(
+          {static_cast<int32_t>(asyncDependencies.size()), 1,
+           (queue ? 1 : 0)});
+  if (asyncToken)
+    odsState.addTypes(asyncToken);
+}
+
 #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
 #include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc"
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -147,8 +147,9 @@
       dynamicSizes.push_back(dimOp);
     }
   }
-  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
-                                      token, dynamicSizes, ValueRange());
+  return builder.create<gpu::AllocOp>(loc, memTp, token.getType(), token,
+                                      dynamicSizes,
+                                      /*symbolOperands=*/ValueRange());
 }
 
 // Allocates a void buffer on the device with given size.
@@ -156,8 +157,8 @@
                          Value token) {
   const auto memTp =
       MemRefType::get({ShapedType::kDynamic}, builder.getI8Type());
-  return builder.create<gpu::AllocOp>(loc, TypeRange({memTp, token.getType()}),
-                                      token, size, ValueRange());
+  return builder.create<gpu::AllocOp>(loc, memTp, token.getType(), token, size,
+                                      /*symbolOperands=*/ValueRange());
 }
 
 /// Deallocates memory from the device.
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -131,9 +131,16 @@
     %c0 = arith.constant 0 : i32
     %t0 = gpu.wait async
 
+    // CHECK: %[[queue:.*]] = gpu.create_queue
+    %queue = gpu.create_queue
+
     // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
 
+    // CHECK: gpu.launch_func <%[[queue]]> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+    gpu.launch_func <%queue> @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
+
+    // CHECK: gpu.launch_func <%[[queue]]> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+    gpu.launch_func <%queue> @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)
+
     gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)
 
     // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
@@ -147,6 +154,8 @@
     // CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref<?xf32, 1>)
     gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%values#0 : f32, %values#1 : memref<?xf32, 1>)
 
+    // CHECK: gpu.destroy_queue %[[queue]]
+    gpu.destroy_queue %queue
 
     return
   }
 
@@ -218,6 +227,7 @@
 
 func.func @alloc() {
   // CHECK-LABEL: func @alloc()
+  // CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1>
   %m0 = gpu.alloc () : memref<13xf32, 1>
 
   // CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1>
@@ -234,6 +244,13 @@
   // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1>
   gpu.dealloc %m2 : memref<13xf32, 1>
 
+  // CHECK: %[[queue:.*]] = gpu.create_queue
+  %queue = gpu.create_queue
+  // CHECK: %[[m3:.*]] = gpu.alloc <%[[queue]]> () : memref<13xf32, 1>
+  %m3 = gpu.alloc <%queue> () : memref<13xf32, 1>
+  // CHECK: gpu.dealloc <%[[queue]]> %[[m3]] : memref<13xf32, 1>
+  gpu.dealloc <%queue> %m3 : memref<13xf32, 1>
+
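+  // Async variants on a queue: alloc and dealloc chained through tokens.
+  // CHECK: %[[qt0:.*]] = gpu.wait async <%[[queue]]>
+  %qt0 = gpu.wait async <%queue>
+  // CHECK: %[[m4:.*]], %[[qt1:.*]] = gpu.alloc async [%[[qt0]]] <%[[queue]]> () : memref<13xf32, 1>
+  %m4, %qt1 = gpu.alloc async [%qt0] <%queue> () : memref<13xf32, 1>
+  // CHECK: %{{.*}} = gpu.dealloc async [%[[qt1]]] <%[[queue]]> %[[m4]] : memref<13xf32, 1>
+  %qt2 = gpu.dealloc async [%qt1] <%queue> %m4 : memref<13xf32, 1>
+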
   return
 }
 
@@ -268,6 +285,16 @@
   %0 = gpu.wait async
   // CHECK: {{.*}} = gpu.memcpy async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
   %1 = gpu.memcpy async [%0] %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
+
+  // CHECK: %[[queue:.*]] = gpu.create_queue
+  %queue = gpu.create_queue
+  // CHECK: gpu.memcpy <%[[queue]]> {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
+  gpu.memcpy <%queue> %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
+  // CHECK: %[[t1:.*]] = gpu.wait async <%[[queue]]>
+  %2 = gpu.wait async <%queue>
+  // CHECK: {{.*}} = gpu.memcpy async [%[[t1]]] <%[[queue]]> {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
+  %3 = gpu.memcpy async [%2] <%queue> %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
+
   return
 }
 
@@ -279,6 +306,16 @@
   %0 = gpu.wait async
   // CHECK: {{.*}} = gpu.memset async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, f32
   %1 = gpu.memset async [%0] %dst, %value : memref<3x7xf32>, f32
+
+  // CHECK: %[[queue:.*]] = gpu.create_queue
+  %queue = gpu.create_queue
+  // CHECK: gpu.memset <%[[queue]]> {{.*}}, {{.*}} : memref<3x7xf32>, f32
+  gpu.memset <%queue> %dst, %value : memref<3x7xf32>, f32
+  // CHECK: %[[t1:.*]] = gpu.wait async <%[[queue]]>
+  %2 = gpu.wait async <%queue>
+  // CHECK: {{.*}} = gpu.memset async [%[[t1]]] <%[[queue]]> {{.*}}, {{.*}} : memref<3x7xf32>, f32
+  %3 = gpu.memset async [%2] <%queue> %dst, %value : memref<3x7xf32>, f32
+
   return
 }
 
@@ -356,6 +393,13 @@
    gpu.wait [%token16]
    return
  }
+
+  // CHECK-LABEL: func @create_queue_with_desc
+  func.func @create_queue_with_desc() {
+    // CHECK: gpu.create_queue "test"
+    %queue = gpu.create_queue "test"
+    return
+  }
 }
 
 // Just check that this doesn't crash.