diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -205,6 +205,14 @@ coordinate work items. Declarations of GPU functions, i.e. not having the body region, are not supported. + A function may optionally be annotated with the block and/or grid sizes + that will be used when it is launched using the `gpu.known_block_size` and + `gpu.known_grid_size` attributes, respectively. If set, these attributes must + be arrays of three 32-bit integers giving the x, y, and z launch dimensions. + Launching a kernel that has these annotations, or that calls a function with + these annotations, using a block size or grid size other than what is specified + is undefined behavior. + Syntax: ``` @@ -311,6 +319,36 @@ return "workgroup_attributions"; } + static constexpr StringLiteral getKnownBlockSizeAttrName() { + return StringLiteral("gpu.known_block_size"); + } + + static constexpr StringLiteral getKnownGridSizeAttrName() { + return StringLiteral("gpu.known_grid_size"); + } + + /// Returns the block size this kernel will be launched with along + /// dimension `dim` if known. The value of gpu.thread_id dim will be strictly + /// less than this size. + Optional getKnownBlockSize(gpu::Dimension dim) { + if (auto array = + (*this)->getAttrOfType(getKnownBlockSizeAttrName())) { + return array[static_cast(dim)]; + } + return std::nullopt; + } + + /// Returns the grid size this kernel will be launched with along + /// dimension `dim` if known. The value of gpu.block_id dim will be strictly + /// less than this size. + Optional getKnownGridSize(gpu::Dimension dim) { + if (auto array = + (*this)->getAttrOfType(getKnownGridSizeAttrName())) { + return array[static_cast(dim)]; + } + return std::nullopt; + } + /// Returns the argument types of this function. ArrayRef getArgumentTypes() { return getFunctionType().getInputs(); } @@ -329,6 +367,8 @@ LogicalResult verifyBody(); }]; let hasCustomAssemblyFormat = 1; + + let hasVerifier = 1; } def GPU_LaunchFuncOp : GPU_Op<"launch_func", diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" @@ -1056,6 +1057,27 @@ return success(); } +static LogicalResult verifyKnownLaunchSizeAttr(gpu::GPUFuncOp op, + StringRef attrName) { + auto maybeAttr = op->getAttr(attrName); + if (!maybeAttr) + return success(); + auto array = maybeAttr.dyn_cast(); + if (!array) + return op.emitOpError(attrName + " must be a dense i32 array"); + if (array.size() != 3) + return op.emitOpError(attrName + " must contain exactly 3 elements"); + return success(); +} + +LogicalResult GPUFuncOp::verify() { + if (failed(verifyKnownLaunchSizeAttr(*this, getKnownBlockSizeAttrName()))) + return failure(); + if (failed(verifyKnownLaunchSizeAttr(*this, getKnownGridSizeAttrName()))) + return failure(); + return success(); +} + //===----------------------------------------------------------------------===// // ReturnOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp --- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/IR/Matchers.h" #include "mlir/Interfaces/InferIntRangeInterface.h" +#include "llvm/Support/MathExtras.h" +#include using namespace mlir; using namespace mlir::gpu; @@ -23,40 +26,106 @@ APInt(width, umax)); } +namespace { +enum class LaunchDims : uint32_t { Block = 0, Grid = 1 }; +} // end namespace + +/// If the operation `op` is in a context that is annotated with maximum +/// launch dimensions (a launch op with constant block or grid +/// sizes or a launch_func op with the appropriate dimensions), return +/// the bound on the maximum size of the dimension that the op is querying. +/// IDs will be one less than this bound. + +static Value valueByDim(KernelDim3 dims, Dimension dim) { + switch (dim) { + case Dimension::x: + return dims.x; + case Dimension::y: + return dims.y; + case Dimension::z: + return dims.z; + } +} + +static uint64_t zext(uint32_t arg) { return static_cast(arg); } + +template +static Optional getKnownLaunchDim(Op op, LaunchDims type) { + Dimension dim = op.getDimension(); + if (auto launch = op->template getParentOfType()) { + KernelDim3 bounds; + switch (type) { + case LaunchDims::Block: + bounds = launch.getBlockSizeOperandValues(); + break; + case LaunchDims::Grid: + bounds = launch.getGridSizeOperandValues(); + break; + } + Value maybeBound = valueByDim(bounds, dim); + APInt value; + if (matchPattern(maybeBound, m_ConstantInt(&value))) + return value.getZExtValue(); + } + + if (auto func = op->template getParentOfType()) { + switch (type) { + case LaunchDims::Block: + return func.getKnownBlockSize(dim).transform(zext); + case LaunchDims::Grid: + return func.getKnownGridSize(dim).transform(zext); + } + } + return std::nullopt; +} + void BlockDimOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(1, kMaxDim)); + Optional knownVal = getKnownLaunchDim(*this, LaunchDims::Block); + if (knownVal) + setResultRange(getResult(), getIndexRange(*knownVal, *knownVal)); + else + setResultRange(getResult(), getIndexRange(1, kMaxDim)); } void BlockIdOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(0, kMaxDim - 1)); + uint64_t max = getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim); + setResultRange(getResult(), getIndexRange(0, max - 1ULL)); } void GridDimOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(1, kMaxDim)); + Optional knownVal = getKnownLaunchDim(*this, LaunchDims::Grid); + if (knownVal) + setResultRange(getResult(), getIndexRange(*knownVal, *knownVal)); + else + setResultRange(getResult(), getIndexRange(1, kMaxDim)); } void ThreadIdOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(0, kMaxDim - 1)); + uint64_t max = getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim); + setResultRange(getResult(), getIndexRange(0, max - 1ULL)); } void LaneIdOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1)); + setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL)); } void SubgroupIdOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), getIndexRange(0, kMaxDim - 1)); + setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL)); } void GlobalIdOp::inferResultRanges(ArrayRef, SetIntRangeFn setResultRange) { - setResultRange(getResult(), - getIndexRange(0, std::numeric_limits::max())); + uint64_t blockIdMax = + getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim) - 1ULL; + uint64_t gridIdMax = + getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim) - 1ULL; + setResultRange(getResult(), getIndexRange(0, blockIdMax * gridIdMax)); } void NumSubgroupsOp::inferResultRanges(ArrayRef, diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -22,10 +22,12 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/RegionUtils.h" +#include namespace mlir { #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS @@ -147,8 +149,27 @@ return success(); } +/// Return the provided KernelDim3 as an array of i32 constants if possible. +static DenseI32ArrayAttr constantDimsAttr(gpu::KernelDim3 dims) { + SmallVector constants; + MLIRContext *ctx = dims.x.getContext(); + for (Value v : {dims.x, dims.y, dims.z}) { + APInt constValue; + if (!matchPattern(v, m_ConstantInt(&constValue))) + return nullptr; + // In the event someone called for a too-large block or grid dimension, + // don't set bounds as it is likely to cause more confusing behavior. + if (constValue.ugt(std::numeric_limits::max())) + return nullptr; + constants.push_back( + constValue.getLimitedValue(std::numeric_limits::max())); + } + return DenseI32ArrayAttr::get(ctx, constants); +} + /// Outline the `gpu.launch` operation body into a kernel function. Replace /// `gpu.terminator` operations by `gpu.return` in the generated function. +/// Set block and grid size bounds if known. static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector &operands) { @@ -173,6 +194,17 @@ auto outlinedFunc = builder.create(loc, kernelFnName, type); outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); + + // If we can infer bounds on the grid and/or block sizes from the arguments + // to the launch op, propagate them to the generated kernel. This is safe + // because multiple launches with the same body are not deduplicated. + if (auto blockBounds = constantDimsAttr(launchOp.getBlockSizeOperandValues())) + outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(), + blockBounds); + if (auto gridBounds = constantDimsAttr(launchOp.getGridSizeOperandValues())) + outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(), + gridBounds); + BlockAndValueMapping map; // Map the arguments corresponding to the launch parameters like blockIdx, diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir --- a/mlir/test/Dialect/GPU/int-range-interface.mlir +++ b/mlir/test/Dialect/GPU/int-range-interface.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-int-range-inference %s | FileCheck %s +// RUN: mlir-opt -test-int-range-inference -split-input-file %s | FileCheck %s // CHECK-LABEL: func @launch_func func.func @launch_func(%arg0 : index) { @@ -41,12 +41,18 @@ %thread_id_y0 = test.reflect_bounds %thread_id_y %thread_id_z0 = test.reflect_bounds %thread_id_z + // The launch bounds are not constant, and so this can't infer anything + // CHECK: test.reflect_bounds {smax = 4294967294 : index, smin = 0 : index, umax = 4294967294 : index, umin = 0 : index} + %thread_id_op = gpu.thread_id y + %thread_id_op0 = test.reflect_bounds %thread_id_op gpu.terminator } func.return } +// ----- + // CHECK-LABEL: func @kernel module attributes {gpu.container_module} { gpu.module @gpu_module { @@ -100,9 +106,9 @@ %global_id_y = gpu.global_id y %global_id_z = gpu.global_id z - // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = 0 : index, umax = 9223372036854775807 : index, umin = 0 : index} - // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = 0 : index, umax = 9223372036854775807 : index, umin = 0 : index} - // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = 0 : index, umax = 9223372036854775807 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = -9223372036854775808 : index, umax = -17179869180 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = -9223372036854775808 : index, umax = -17179869180 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 9223372036854775807 : index, smin = -9223372036854775808 : index, umax = -17179869180 : index, umin = 0 : index} %global_id_x0 = test.reflect_bounds %global_id_x %global_id_y0 = test.reflect_bounds %global_id_y %global_id_z0 = test.reflect_bounds %global_id_z @@ -126,3 +132,86 @@ } } +// ----- + +// CHECK-LABEL: func @annotated_kernel +module attributes {gpu.container_module} { + gpu.module @gpu_module { + gpu.func @annotated_kernel() kernel + attributes {gpu.known_block_size = array, + gpu.known_grid_size = array} { + + %grid_dim_x = gpu.grid_dim x + %grid_dim_y = gpu.grid_dim y + %grid_dim_z = gpu.grid_dim z + + // CHECK: test.reflect_bounds {smax = 20 : index, smin = 20 : index, umax = 20 : index, umin = 20 : index} + // CHECK: test.reflect_bounds {smax = 24 : index, smin = 24 : index, umax = 24 : index, umin = 24 : index} + // CHECK: test.reflect_bounds {smax = 28 : index, smin = 28 : index, umax = 28 : index, umin = 28 : index} + %grid_dim_x0 = test.reflect_bounds %grid_dim_x + %grid_dim_y0 = test.reflect_bounds %grid_dim_y + %grid_dim_z0 = test.reflect_bounds %grid_dim_z + + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %block_id_z = gpu.block_id z + + // CHECK: test.reflect_bounds {smax = 19 : index, smin = 0 : index, umax = 19 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 23 : index, smin = 0 : index, umax = 23 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 27 : index, smin = 0 : index, umax = 27 : index, umin = 0 : index} + %block_id_x0 = test.reflect_bounds %block_id_x + %block_id_y0 = test.reflect_bounds %block_id_y + %block_id_z0 = test.reflect_bounds %block_id_z + + %block_dim_x = gpu.block_dim x + %block_dim_y = gpu.block_dim y + %block_dim_z = gpu.block_dim z + + // CHECK: test.reflect_bounds {smax = 8 : index, smin = 8 : index, umax = 8 : index, umin = 8 : index} + // CHECK: test.reflect_bounds {smax = 12 : index, smin = 12 : index, umax = 12 : index, umin = 12 : index} + // CHECK: test.reflect_bounds {smax = 16 : index, smin = 16 : index, umax = 16 : index, umin = 16 : index} + %block_dim_x0 = test.reflect_bounds %block_dim_x + %block_dim_y0 = test.reflect_bounds %block_dim_y + %block_dim_z0 = test.reflect_bounds %block_dim_z + + %thread_id_x = gpu.thread_id x + %thread_id_y = gpu.thread_id y + %thread_id_z = gpu.thread_id z + + // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 11 : index, smin = 0 : index, umax = 11 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 15 : index, smin = 0 : index, umax = 15 : index, umin = 0 : index} + %thread_id_x0 = test.reflect_bounds %thread_id_x + %thread_id_y0 = test.reflect_bounds %thread_id_y + %thread_id_z0 = test.reflect_bounds %thread_id_z + + %global_id_x = gpu.global_id x + %global_id_y = gpu.global_id y + %global_id_z = gpu.global_id z + + // CHECK: test.reflect_bounds {smax = 133 : index, smin = 0 : index, umax = 133 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 253 : index, smin = 0 : index, umax = 253 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 405 : index, smin = 0 : index, umax = 405 : index, umin = 0 : index} + %global_id_x0 = test.reflect_bounds %global_id_x + %global_id_y0 = test.reflect_bounds %global_id_y + %global_id_z0 = test.reflect_bounds %global_id_z + + %subgroup_size = gpu.subgroup_size : index + %lane_id = gpu.lane_id + %num_subgroups = gpu.num_subgroups : index + %subgroup_id = gpu.subgroup_id : index + + // CHECK: test.reflect_bounds {smax = 128 : index, smin = 1 : index, umax = 128 : index, umin = 1 : index} + // CHECK: test.reflect_bounds {smax = 127 : index, smin = 0 : index, umax = 127 : index, umin = 0 : index} + // CHECK: test.reflect_bounds {smax = 4294967295 : index, smin = 1 : index, umax = 4294967295 : index, umin = 1 : index} + // CHECK: test.reflect_bounds {smax = 4294967294 : index, smin = 0 : index, umax = 4294967294 : index, umin = 0 : index} + %subgroup_size0 = test.reflect_bounds %subgroup_size + %lane_id0 = test.reflect_bounds %lane_id + %num_subgroups0 = test.reflect_bounds %num_subgroups + %subgroup_id0 = test.reflect_bounds %subgroup_id + + gpu.return + } + } +} + diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -599,3 +599,25 @@ %1 = gpu.alloc(%0) : memref<2x?x?xf32, 1> return } + +// ----- + +module attributes {gpu.container_module} { + gpu.module @kernel { + // expected-error@+1 {{'gpu.func' op gpu.known_block_size must be a dense i32 array}} + gpu.func @kernel() kernel attributes {gpu.known_block_size = 32 : i32} { + gpu.return + } + } +} + +// ----- + +module attributes {gpu.container_module} { + gpu.module @kernel { + // expected-error@+1 {{'gpu.func' op gpu.known_block_size must contain exactly 3 elements}} + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + gpu.return + } + } +} diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -41,6 +41,8 @@ // CHECK-LABEL: gpu.module @launch_kernel // CHECK-NEXT: gpu.func @launch_kernel // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref) +// CHECK-SAME: gpu.known_block_size = array +// CHECK-SAME: gpu.known_grid_size = array // CHECK-NEXT: %[[BID:.*]] = gpu.block_id x // CHECK-NEXT: = gpu.block_id y // CHECK-NEXT: = gpu.block_id z @@ -291,3 +293,20 @@ // CHECK: func @device_function() // CHECK: func @recursive_device_function() // CHECK-NOT: func @device_function + +// ----- + +// CHECK-LABEL: @non_constant_launches +func.func @non_constant_launches(%arg0 : index) { + // CHECK-NOT: gpu.known_block_size + // CHECK-NOT: gpu.known_grid_size + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %arg0, %grid_y = %arg0, + %grid_z = %arg0) + threads(%tx, %ty, %tz) in (%block_x = %arg0, %block_y = %arg0, + %block_z = %arg0) { + gpu.terminator + } + return +} + +// CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>}