diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -536,12 +536,14 @@
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.

-    The body region has _twelve_ arguments, grouped as follows:
+    The body region has at least _twelve_ arguments, grouped as follows:

    -   three arguments that contain block identifiers along x,y,z dimensions;
    -   three arguments that contain thread identifiers along x,y,z dimensions;
    -   operands of the `gpu.launch` operation as is (i.e. the operands for
        grid and block sizes).
+   -   a variadic number of workgroup memory attributions;
+   -   a variadic number of private memory attributions.

    Syntax:

@@ -550,8 +552,11 @@
                   `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                   `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                   (dynamic_shared_memory_size ssa-use)?
+                  memory-attribution
                   region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
+   memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+                          (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

@@ -582,6 +587,18 @@
      "some_op"(%bx, %tx) : (index, index) -> ()
      %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }
+
+   // Launch with memory attributions.
+   gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
+              threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
+              workgroup(%workgroup: memref<32xf32, 3>)
+              private(%private: memref<1xf32, 5>) {
+     // Block and thread identifiers, as well as block/grid sizes are
+     // immediately usable inside body region.
+     "some_op"(%bx, %tx) : (index, index) -> ()
+     // Loading a value from workgroup memory.
+     %42 = memref.load %workgroup[%bx] : memref<32xf32, 3>
+   }
    ```

    Rationale: using operation/block arguments gives analyses a clear way of
@@ -601,7 +618,9 @@
               "Value":$blockSizeZ,
               CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
               CArg<"Type", "nullptr">:$asyncTokenType,
-              CArg<"ValueRange", "{}">:$asyncDependencies)>
+              CArg<"ValueRange", "{}">:$asyncDependencies,
+              CArg<"TypeRange", "{}">:$workgroupAttributions,
+              CArg<"TypeRange", "{}">:$privateAttributions)>
  ];

  let extraClassDeclaration = [{
@@ -632,6 +651,57 @@
    /// The number of region attributes containing the launch configuration,
    /// placed in the leading positions of the argument list.
    static constexpr unsigned kNumConfigRegionAttributes = 12;
+
+   /// Returns the keywords used in the custom syntax for this Op.
+   static StringRef getWorkgroupKeyword() { return "workgroup"; }
+   static StringRef getPrivateKeyword() { return "private"; }
+
+   /// Returns the number of buffers located in the workgroup memory.
+   unsigned getNumWorkgroupAttributions() {
+     auto attr = (*this)->getAttrOfType<IntegerAttr>(
+         getNumWorkgroupAttributionsAttrName());
+     return attr ? attr.getInt() : 0;
+   }
+
+   /// Returns a list of block arguments that correspond to buffers located
+   /// in the workgroup memory.
+   ArrayRef<BlockArgument> getWorkgroupAttributions() {
+     auto begin =
+         std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+     auto end = std::next(begin, getNumWorkgroupAttributions());
+     return {begin, end};
+   }
+
+   /// Adds a new block argument that corresponds to buffers located in
+   /// workgroup memory.
+   BlockArgument addWorkgroupAttribution(Type type, Location loc);
+
+   /// Returns the number of buffers located in the private memory.
+   unsigned getNumPrivateAttributions() {
+     return getBody().getNumArguments() - kNumConfigRegionAttributes -
+            getNumWorkgroupAttributions();
+   }
+
+   /// Returns a list of block arguments that correspond to buffers located
+   /// in the private memory.
+   ArrayRef<BlockArgument> getPrivateAttributions() {
+     // Buffers in private memory always come after buffers in workgroup
+     // memory.
+     auto begin =
+         std::next(getBody().args_begin(),
+                   kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+     return {begin, getBody().args_end()};
+   }
+
+   /// Adds a new block argument that corresponds to buffers located in
+   /// private memory.
+   BlockArgument addPrivateAttribution(Type type, Location loc);
+
+   /// Returns the name of the attribute containing the number of buffers
+   /// located in the workgroup memory.
+   static StringRef getNumWorkgroupAttributionsAttrName() {
+     return "workgroup_attributions";
+   }
  }];

  let hasCanonicalizer = 1;
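The extended builder and the in-place `add*Attribution` helpers compose naturally. A minimal usage sketch, assuming an `OpBuilder b`, a `Location loc`, and previously computed grid/block size values; the helper name is hypothetical and this is illustrative pass code, not part of the patch:

```c++
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// Hypothetical helper: build a gpu.launch that owns one workgroup buffer,
// then append a private buffer through the new mutator.
static gpu::LaunchOp buildLaunchWithAttributions(OpBuilder &b, Location loc,
                                                 Value gridX, Value gridY,
                                                 Value gridZ, Value blkX,
                                                 Value blkY, Value blkZ) {
  MLIRContext *ctx = b.getContext();
  auto wgSpace = gpu::AddressSpaceAttr::get(ctx, gpu::AddressSpace::Workgroup);
  auto wgType = MemRefType::get({32}, b.getF32Type(),
                                MemRefLayoutAttrInterface(), wgSpace);
  SmallVector<Type> wgAttribs = {wgType};

  auto launch = b.create<gpu::LaunchOp>(
      loc, gridX, gridY, gridZ, blkX, blkY, blkZ,
      /*dynamicSharedMemorySize=*/Value(), /*asyncTokenType=*/Type(),
      /*asyncDependencies=*/ValueRange(),
      /*workgroupAttributions=*/wgAttribs,
      /*privateAttributions=*/TypeRange());

  // The attribution lists can still be grown after creation; appending a
  // private buffer leaves the workgroup counter attribute untouched.
  auto pvSpace = gpu::AddressSpaceAttr::get(ctx, gpu::AddressSpace::Private);
  auto pvType = MemRefType::get({1}, b.getF32Type(),
                                MemRefLayoutAttrInterface(), pvSpace);
  launch.addPrivateAttribution(pvType, loc);
  return launch;
}
```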
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -332,6 +332,60 @@
   printer << ']';
 }

+// GPU memory attribution functions shared by LaunchOp and GPUFuncOp.
+/// Parses a GPU function memory attribution.
+///
+/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+///                        (`private` `(` ssa-id-and-type-list `)`)?
+///
+/// Note that this function parses only one of the two similar parts, with the
+/// keyword provided as argument.
+static ParseResult
+parseAttributions(OpAsmParser &parser, StringRef keyword,
+                  SmallVectorImpl<OpAsmParser::Argument> &args) {
+  // If we could not parse the keyword, just assume an empty list and succeed.
+  if (failed(parser.parseOptionalKeyword(keyword)))
+    return success();
+
+  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
+                                  /*allowType=*/true);
+}
+
+/// Prints a GPU function memory attribution.
+static void printAttributions(OpAsmPrinter &p, StringRef keyword,
+                              ArrayRef<BlockArgument> values) {
+  if (values.empty())
+    return;
+
+  p << ' ' << keyword << '(';
+  llvm::interleaveComma(
+      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
+  p << ')';
+}
+
+/// Verifies a GPU function memory attribution.
+static LogicalResult verifyAttributions(Operation *op,
+                                        ArrayRef<BlockArgument> attributions,
+                                        gpu::AddressSpace memorySpace) {
+  for (Value v : attributions) {
+    auto type = v.getType().dyn_cast<MemRefType>();
+    if (!type)
+      return op->emitOpError() << "expected memref type in attribution";
+
+    // We can only verify the address space if it hasn't already been lowered
+    // from the AddressSpaceAttr to a target-specific numeric value.
+    auto addressSpace =
+        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
+    if (!addressSpace)
+      continue;
+    if (addressSpace.getValue() != memorySpace)
+      return op->emitOpError()
+             << "expected memory space " << stringifyAddressSpace(memorySpace)
+             << " in attribution";
+  }
+  return success();
+}
+
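A detail worth noting: `verifyAttributions` only checks the address space when the memref carries a `#gpu.address_space` attribute; plain numeric spaces (such as the `3` and `5` used in the tests below) are skipped, since they may already be lowered to target-specific values. For illustration only (hypothetical input, not a test from this patch), a mismatched attribution would be rejected roughly like so:

```mlir
func.func @wrong_space() {
  %c1 = arith.constant 1 : index
  // Diagnostic produced by the check above:
  //   'gpu.launch' op expected memory space workgroup in attribution
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1)
             workgroup(%w : memref<32xf32, #gpu.address_space<private>>) {
    gpu.terminator
  }
  return
}
```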
 //===----------------------------------------------------------------------===//
 // AllReduceOp
 //===----------------------------------------------------------------------===//

@@ -439,7 +493,15 @@
                      Value gridSizeX, Value gridSizeY, Value gridSizeZ,
                      Value getBlockSizeX, Value getBlockSizeY,
                      Value getBlockSizeZ, Value dynamicSharedMemorySize,
-                     Type asyncTokenType, ValueRange asyncDependencies) {
+                     Type asyncTokenType, ValueRange asyncDependencies,
+                     TypeRange workgroupAttributions,
+                     TypeRange privateAttributions) {
+  // Add an attribute holding the number of workgroup attributions. It is
+  // required to tell the workgroup attributions apart from the private ones
+  // in the list of block arguments.
+  result.addAttribute(getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(workgroupAttributions.size()));
+
+  // Add op operands.
   result.addOperands(asyncDependencies);
   if (asyncTokenType)
     result.types.push_back(builder.getType<AsyncTokenType>());
@@ -450,14 +512,21 @@
   if (dynamicSharedMemorySize)
     result.addOperands(dynamicSharedMemorySize);

-  // Create a kernel body region with kNumConfigRegionAttributes + N arguments,
-  // where the first kNumConfigRegionAttributes arguments have `index` type and
-  // the rest have the same types as the data operands.
+  // Create a kernel body region with kNumConfigRegionAttributes + N arguments,
+  // where N is the number of memory attributions: the first
+  // kNumConfigRegionAttributes arguments have `index` type and the rest have
+  // the attribution memref types.
   Region *kernelRegion = result.addRegion();
   Block *body = new Block();
+  // TODO: Allow passing in proper locations here.
   for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
     body->addArgument(builder.getIndexType(), result.location);
+  // Append the workgroup and private attributions to the region arguments.
+  for (Type argTy : workgroupAttributions)
+    body->addArgument(argTy, result.location);
+  for (Type argTy : privateAttributions)
+    body->addArgument(argTy, result.location);
   kernelRegion->push_back(body);
+  // Populate the operand segment size attribute.
   SmallVector<int32_t, 8> segmentSizes(8, 1);
   segmentSizes.front() = asyncDependencies.size();
   segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
@@ -504,13 +573,18 @@
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
   // for block/thread identifiers and grid/block sizes.
   if (!getBody().empty()) {
-    if (getBody().getNumArguments() !=
-        LaunchOp::kNumConfigOperands + getNumOperands() -
-            (getDynamicSharedMemorySize() ? 1 : 0) -
-            getAsyncDependencies().size())
+    if (getBody().getNumArguments() <
+        kNumConfigRegionAttributes + getNumWorkgroupAttributions())
       return emitOpError("unexpected number of region arguments");
   }

+  // Verify the attribution address spaces.
+  if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
+                                GPUDialect::getWorkgroupAddressSpace())) ||
+      failed(verifyAttributions(getOperation(), getPrivateAttributions(),
+                                GPUDialect::getPrivateAddressSpace())))
+    return failure();
+
   // Block terminators without successors are expected to exit the kernel region
   // and must be `gpu.terminator`.
   for (Block &block : getBody()) {
@@ -563,10 +637,15 @@
     p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
       << getDynamicSharedMemorySize();

+  printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
+  printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
+
   p << ' ';
   p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
   p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
-                              LaunchOp::getOperandSegmentSizeAttr()});
+                              LaunchOp::getOperandSegmentSizeAttr(),
+                              getNumWorkgroupAttributionsAttrName()});
 }
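Since `workgroup_attributions` is now elided from the printed attribute dictionary and recomputed during parsing, the custom form round-trips without ever showing the bookkeeping attribute. Roughly (hypothetical IR, assuming `%c1` is defined):

```mlir
// Custom form: the attributions are purely syntactic here...
gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
           threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1)
           workgroup(%w : memref<32xf32, 3>) {
  gpu.terminator
}
// ...while the generic form would spell the counter out, along the lines of
// {operandSegmentSizes = ..., workgroup_attributions = 1 : i64}.
```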
 // Parse the size assignment blocks for blocks and threads.  These have the form
@@ -601,8 +680,9 @@
 /// Parses a Launch operation.
 /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
-//  `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
 /// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+/// memory-attribution
 /// region attr-dict?
 /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
 ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
@@ -659,9 +739,12 @@
       return failure();
   }

-  // Introduce the body region and parse it. The region has
-  // kNumConfigRegionAttributes arguments that correspond to
-  // block/thread identifiers and grid/block sizes, all of the `index` type.
+  // Create the region arguments: the region has kNumConfigRegionAttributes
+  // arguments corresponding to block/thread identifiers and grid/block sizes,
+  // all of `index` type, followed by a variadic number of workgroup
+  // attributions and a variadic number of private attributions. The number of
+  // workgroup attributions is stored in the attribute named by
+  // LaunchOp::getNumWorkgroupAttributionsAttrName().
   Type index = parser.getBuilder().getIndexType();
   SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
       LaunchOp::kNumConfigRegionAttributes, index);
@@ -674,6 +757,27 @@
     regionArguments.push_back(arg);
   }

+  Builder &builder = parser.getBuilder();
+  // Parse workgroup memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getWorkgroupKeyword(),
+                               regionArguments)))
+    return failure();
+
+  // Store the number of arguments we just parsed as the number of workgroup
+  // memory attributions.
+  unsigned numWorkgroupAttrs =
+      regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+  result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(numWorkgroupAttrs));
+
+  // Parse private memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(),
+                               regionArguments)))
+    return failure();
+
+  // Introduce the body region and parse it.
   Region *body = result.addRegion();
   if (parser.parseRegion(*body, regionArguments) ||
       parser.parseOptionalAttrDict(result.attributes))
@@ -729,6 +833,25 @@
   rewrites.add<FoldLaunchArguments>(context);
 }

+/// Adds a new block argument that corresponds to buffers located in
+/// workgroup memory.
+BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
+  auto attrName = getNumWorkgroupAttributionsAttrName();
+  auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
+  (*this)->setAttr(attrName,
+                   IntegerAttr::get(attr.getType(), attr.getValue() + 1));
+  return getBody().insertArgument(
+      LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+}
+
+/// Adds a new block argument that corresponds to buffers located in
+/// private memory.
+BlockArgument LaunchOp::addPrivateAttribution(Type type, Location loc) {
+  // Buffers in private memory always come after buffers in workgroup memory.
+  return getBody().addArgument(type, loc);
+}
+
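The two mutators rely on the entry block's fixed argument layout; with `W` workgroup and `P` private attributions it is (illustrative sketch):

```
[0 .. 11]             block/thread ids and grid/block sizes (all `index`)
[12 .. 12+W-1]        workgroup attributions
[12+W .. 12+W+P-1]    private attributions
```

Hence `addWorkgroupAttribution` inserts at index `12 + W`, shifting any private attributions one slot to the right, while `addPrivateAttribution` can simply append at the very end.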
 //===----------------------------------------------------------------------===//
 // LaunchFuncOp
 //===----------------------------------------------------------------------===//
@@ -894,24 +1017,6 @@
   body->getBlocks().push_back(entryBlock);
 }

-/// Parses a GPU function memory attribution.
-///
-/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
-///                        (`private` `(` ssa-id-and-type-list `)`)?
-///
-/// Note that this function parses only one of the two similar parts, with the
-/// keyword provided as argument.
-static ParseResult
-parseAttributions(OpAsmParser &parser, StringRef keyword,
-                  SmallVectorImpl<OpAsmParser::Argument> &args) {
-  // If we could not parse the keyword, just assume an empty list and succeed.
-  if (failed(parser.parseOptionalKeyword(keyword)))
-    return success();
-
-  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
-                                  /*allowType=*/true);
-}
-
 /// Parses a GPU function.
 ///
 /// <operation> ::= `gpu.func` symbol-ref-id `(` argument-list `)`
@@ -985,17 +1090,6 @@
   return parser.parseRegion(*body, entryArgs);
 }

-static void printAttributions(OpAsmPrinter &p, StringRef keyword,
-                              ArrayRef<BlockArgument> values) {
-  if (values.empty())
-    return;
-
-  p << ' ' << keyword << '(';
-  llvm::interleaveComma(
-      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
-  p << ')';
-}
-
 void GPUFuncOp::print(OpAsmPrinter &p) {
   p << ' ';
   p.printSymbolName(getName());
@@ -1026,28 +1120,6 @@
   return success();
 }

-static LogicalResult verifyAttributions(Operation *op,
-                                        ArrayRef<BlockArgument> attributions,
-                                        gpu::AddressSpace memorySpace) {
-  for (Value v : attributions) {
-    auto type = v.getType().dyn_cast<MemRefType>();
-    if (!type)
-      return op->emitOpError() << "expected memref type in attribution";
-
-    // We can only verify the address space if it hasn't already been lowered
-    // from the AddressSpaceAttr to a target-specific numeric value.
-    auto addressSpace =
-        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
-    if (!addressSpace)
-      continue;
-    if (addressSpace.getValue() != memorySpace)
-      return op->emitOpError()
-             << "expected memory space " << stringifyAddressSpace(memorySpace)
-             << " in attribution";
-  }
-  return success();
-}
-
 /// Verifies the body of the function.
 LogicalResult GPUFuncOp::verifyBody() {
   if (empty())
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -190,7 +190,10 @@
   }
   FunctionType type =
       FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
+      loc, kernelFnName, type,
+      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
+      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
   outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                         builder.getUnitAttr());
@@ -213,6 +216,16 @@
   Region &outlinedFuncBody = outlinedFunc.getBody();
   injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

+  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getWorkgroupAttributions(),
+                 outlinedFunc.getWorkgroupAttributions()))
+    map.map(launchArg, funcArg);
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getPrivateAttributions(),
+                 outlinedFunc.getPrivateAttributions()))
+    map.map(launchArg, funcArg);
+
   // Map arguments from gpu.launch region to the arguments of the gpu.func
   // operation.
   Block &entryBlock = outlinedFuncBody.front();
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -310,3 +310,105 @@
 }

 // CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK: module attributes {gpu.container_module}
+
+// -----
+
+// CHECK-LABEL: func @launch_memory_attributions_0()
+func.func @launch_memory_attributions_0() {
+  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
+  %0 = "op"() : () -> (f32)
+  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  // CHECK: %[[GDIMX:.*]] = arith.constant 8
+  %gDimX = arith.constant 8 : index
+  // CHECK: %[[GDIMY:.*]] = arith.constant 12
+  %gDimY = arith.constant 12 : index
+  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
+  %gDimZ = arith.constant 16 : index
+  // CHECK: %[[BDIMX:.*]] = arith.constant 20
+  %bDimX = arith.constant 20 : index
+  // CHECK: %[[BDIMY:.*]] = arith.constant 24
+  %bDimY = arith.constant 24 : index
+  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
+  %bDimZ = arith.constant 28 : index
+
+  // CHECK: gpu.launch_func @launch_memory_attributions_0_kernel::@launch_memory_attributions_0_kernel blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
+  // CHECK-NOT: gpu.launch blocks
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+                                       %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+                                        %block_z = %bDimZ)
+             workgroup(%shared: memref<42xf32, 3>)
+             private(%priv0: memref<2xf32, 5>, %priv1: memref<1xf32, 5>) {
+    "use"(%0): (f32) -> ()
+    "some_op"(%bx, %block_x) : (index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    %43 = memref.load %shared[%tx] : memref<42xf32, 3>
+    %44 = memref.load %priv1[%tx] : memref<1xf32, 5>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel
+// CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel
+// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
+// CHECK-SAME: workgroup(%[[KERNEL_ARG2:.*]] : memref<42xf32, 3>)
+// CHECK-SAME: private(%[[KERNEL_ARG3:.*]] : memref<2xf32, 5>, %[[KERNEL_ARG4:.*]] : memref<1xf32, 5>)
+// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
+// CHECK-NEXT: = gpu.block_id y
+// CHECK-NEXT: = gpu.block_id z
+// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
+// CHECK-NEXT: = gpu.thread_id y
+// CHECK-NEXT: = gpu.thread_id z
+// CHECK-NEXT: = gpu.grid_dim x
+// CHECK-NEXT: = gpu.grid_dim y
+// CHECK-NEXT: = gpu.grid_dim z
+// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
+// CHECK-NEXT: = gpu.block_dim y
+// CHECK-NEXT: = gpu.block_dim z
+// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
+// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
+// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG2]][%[[TID]]] : memref<42xf32, 3>
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG4]][%[[TID]]] : memref<1xf32, 5>
+
+// -----
+
+// CHECK-LABEL: @launch_memory_attributions_1
+func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %d = memref.dim %arg0, %c2 : memref<*xf32>
+  // CHECK: gpu.func {{.*}} private(%[[KERNEL_ARG:.*]] : memref<3xf32, 5>) {{.*}} {
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[D:.*]] = memref.dim %[[ARG:.*]], %[[C2]]
+  // CHECK: "use1"(%[[D]])
+  // CHECK: "use2"(%[[C2]], %[[C2]])
+  // CHECK: "use3"(%[[ARG]]) {{.*}}
+  // CHECK-NEXT: = memref.load %[[KERNEL_ARG]][%[[C2]]] : memref<3xf32, 5>
+  // CHECK: gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+                                        %block_z = %c1)
+             private(%priv0: memref<3xf32, 5>) {
+    "use1"(%d) : (index) -> ()
+    "use2"(%c2, %c2) : (index, index) -> ()
+    "use3"(%arg0) : (memref<*xf32>) -> ()
+    %42 = memref.load %priv0[%c2] : memref<3xf32, 5>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
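If you want to exercise the new cases by hand, something along these lines should work (a sketch: the pass flag mirrors the file's RUN lines, and the exact paths depend on your checkout and build layout):

```sh
# Outline the kernels and inspect the result directly.
mlir-opt --allow-unregistered-dialect --split-input-file \
         --gpu-kernel-outlining mlir/test/Dialect/GPU/outlining.mlir

# Or run the whole test through lit from the build directory.
bin/llvm-lit -v tools/mlir/test/Dialect/GPU/outlining.mlir
```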