diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -48,7 +48,7 @@
 //===----------------------------------------------------------------------===//
 
 def Tensor_CastOp : Tensor_Op<"cast", [
-    DeclareOpInterfaceMethods<CastOpInterface>, 
+    DeclareOpInterfaceMethods<CastOpInterface>,
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
     Pure
   ]> {
@@ -257,7 +257,7 @@
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
     DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>,
     AttrSizedOperandSegments,
-    Pure, 
+    Pure,
     OffsetSizeAndStrideOpInterface
   ]> {
   let summary = "extract slice operation";
@@ -364,7 +364,7 @@
     // Build an ExtractSliceOp with mixed static and dynamic entries packed in
     // a Range vector.
     OpBuilder<(ins "Value":$source, "ArrayRef<Range>":$ranges,
-      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>, 
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -502,19 +502,19 @@
     tensor at the given indices.
 
     In its most general form, the tensor of indices specifies all the coordinates
-    of every element to extract (i.e. COO format, without the payload). 
+    of every element to extract (i.e. COO format, without the payload).
     The indices are expected to be confined to coordinate values that fit the
     range of the `source` tensor, otherwise the behavior is undefined.
 
     The leading dimensions of the index tensor give the result tensor its leading
-    dimensions. The trailing dimensions of the result tensor are obtained from 
-    the source tensor by omitting the dimensions specified in `gather_dims` 
+    dimensions. The trailing dimensions of the result tensor are obtained from
+    the source tensor by omitting the dimensions specified in `gather_dims`
    (rank-reducing semantics) or setting them to `1` (rank-preserving semantics)
    (see examples).
    The trailing dimension of the index tensor contains the coordinates and is
    expected to have its size equal to the number of dimensions being gathered.
    This convention allows an idiomatic specification and lowering of "gathering
-    multiple N-D slices from the source tensor". 
+    multiple N-D slices from the source tensor".
 
    Note: in the examples below, we separate out the indexing part of the tensor
    type by a whitespace for readability purposes.
@@ -522,7 +522,7 @@
    Example:
 
    ```mlir
-        // For each 1x2 triple of coordinates in %indices, extract the 
+        // For each 1x2 triple of coordinates in %indices, extract the
        // element (i.e. 0-D subset) at the coordinates triple in %source.
        //
        %out = tensor.gather %source[%indices] gather_dims([0, 1, 2]) :
@@ -541,20 +541,20 @@
        // slice %source[*, %indices[...]:%indices[...] + 1, *] with the indices
        // corresponding to the `gather_dims` attribute specified by %indices.
        //
-        %out = tensor.gather %source[%indices] gather_dims([1]) : 
+        %out = tensor.gather %source[%indices] gather_dims([1]) :
          (tensor<3x4x5xf32>, tensor<6x7x 1xindex>) -> tensor<6x7x 3x1x5xf32>
 
        // Note: result type may be further rank-reduced to tensor<6x7x 3x5xf32>.
    ```
 
    The dimensions specified in the gather_dims attribute are ones for which the
-    result tensor has size `1`. 
+    result tensor has size `1`.
    I.e. if the source type is `axbxcxd` and the coordinates are [1, 3], then
    the shape suffix is `ax1xcx1`.
    Gather also allows rank-reducing semantics where the shape `ax1xcx1` can be
    further simplified to `axc`.
 
-    The elemental type of the indices tensor can be any integer type. 
+    The elemental type of the indices tensor can be any integer type.
    In the absence of target-specific or problem-specific information the default
    type one should use is `index`.
 
@@ -565,50 +565,50 @@
    Incorrectly setting the `unique` attribute when the coordinates are not truly
    unique is undefined behavior.
 
-    Only full slices are meant to be supported by this op, if one desires 
+    Only full slices are meant to be supported by this op; if one desires
    partial slices (e.g. strided windows) one should compose this op with other
    tensor ops (e.g. tensor.extract_slice). This is to avoid a slippery slope of
    complexity that would make the op unusable in practice.
 
-    At the tensor-level, the index tensor is specified in an AoS form (i.e.
-    coordinate tuple is the most minor). It is the responsibility of further 
+    At the tensor-level, the index tensor is specified in an AoS form (i.e.
+    coordinate tuple is the most minor). It is the responsibility of further
    lowerings and bufferization to implement various concrete layouts.
 
    Note: As currently specified, the operation must lower to an abstraction that
    performs copies to the output tensor. This is because the buffer type system
-    is currently not rich enough to allow multiple non-contiguous views in the 
+    is currently not rich enough to allow multiple non-contiguous views in the
    same type. This is visible more clearly in a notional buffer version of the
    op:
 
    ```mlir
      // memref<?x 4x1xf32> is a contiguous buffer of ?x4x1 elements.
      // gather from random source slices must copy to the contiguous output.
-      %out = memref.gather %source[%indices] gather_dims([1]) : 
+      %out = memref.gather %source[%indices] gather_dims([1]) :
        (memref<4x4xf32>, memref<?x 1xindex>) -> memref<?x 4x1xf32>
 
-      // Nested buffer support would allow gather to directly index into the 
+      // Nested buffer support would allow gather to directly index into the
      // source buffer (i.e. represent a jagged view into the source).
-      %out = memref.gather %source[%indices] gather_dims([1]) : 
+      %out = memref.gather %source[%indices] gather_dims([1]) :
        (memref<4x4xf32>, memref<?x 1xindex>) -> memref<?x memref<4x1xf32>>
    ```
  }];
 
-  let arguments = (ins AnyRankedTensor:$source, 
+  let arguments = (ins AnyRankedTensor:$source,
                       RankedTensorOf<[AnySignlessIntegerOrIndex]>:$indices,
                       DenseI64ArrayAttr:$gather_dims,
                       UnitAttr:$unique);
  let results = (outs AnyRankedTensor:$result);
 
  let assemblyFormat = [{
-    $source `[` $indices `]` 
+    $source `[` $indices `]`
    `gather_dims` `(` $gather_dims `)`
-    (`unique` $unique^)? 
+    (`unique` $unique^)?
    attr-dict
    `:` functional-type(operands, results)
  }];
 
  let extraClassDeclaration = [{
-    // TODO: InferTypeOpInterface once enough confidence is built with 
+    // TODO: InferTypeOpInterface once enough confidence is built with
    // tensor and its lowering to memref.
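To make the gather shape rule concrete, here is a standalone C++ sketch of the documented semantics. It is illustrative only: `gatherResultShape` is a hypothetical helper, not part of this patch or of `GatherOp` (the op's own inference is declared in `inferResultType` above).

```cpp
#include <cstdint>
#include <vector>

// Result shape = leading dims of the index tensor (all but the trailing
// coordinate dim), followed by the source dims with each gathered dim set
// to 1 (rank-preserving) or dropped (rank-reducing). Assumes gatherDims is
// sorted ascending, as in the examples above.
std::vector<int64_t> gatherResultShape(const std::vector<int64_t> &sourceShape,
                                       const std::vector<int64_t> &indicesShape,
                                       const std::vector<int64_t> &gatherDims,
                                       bool rankReduced) {
  std::vector<int64_t> result(indicesShape.begin(), indicesShape.end() - 1);
  for (size_t dim = 0, g = 0; dim < sourceShape.size(); ++dim) {
    if (g < gatherDims.size() && gatherDims[g] == static_cast<int64_t>(dim)) {
      ++g;
      if (!rankReduced)
        result.push_back(1); // Gathered dim kept with size 1.
    } else {
      result.push_back(sourceShape[dim]); // Non-gathered dim kept as-is.
    }
  }
  return result;
}

// gatherResultShape({3, 4, 5}, {6, 7, 1}, {1}, /*rankReduced=*/false)
// yields {6, 7, 3, 1, 5}, i.e. tensor<6x7x 3x1x5xf32> as in the example
// above; with rankReduced=true it yields {6, 7, 3, 5} (tensor<6x7x 3x5xf32>).
```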
    static RankedTensorType inferResultType(RankedTensorType sourceType,
                                            RankedTensorType indicesType,
@@ -739,9 +739,9 @@
 def Tensor_InsertSliceOp : Tensor_OpWithOffsetSizesAndStrides<"insert_slice", [
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
     DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>,
-    AttrSizedOperandSegments, 
+    AttrSizedOperandSegments,
     DestinationStyleOpInterface,
-    Pure, 
+    Pure,
     OffsetSizeAndStrideOpInterface,
     TypesMatchWith<"expected result type to match dest type",
                    "dest", "result", "$_self">
@@ -1127,7 +1127,7 @@
 def Tensor_PadOp : Tensor_Op<"pad", [
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
-    AttrSizedOperandSegments, 
+    AttrSizedOperandSegments,
     Pure,
     SingleBlockImplicitTerminator<"mlir::tensor::YieldOp">]> {
   let summary = "tensor pad operation";
@@ -1475,7 +1475,7 @@
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
     Pure
   ]> {
-  let summary = 
+  let summary =
      "scatter a tensor into a destination tensor at specified indices";
  let description = [{
    The `scatter` operation inserts a `source` tensor into a `dest` tensor at
@@ -1486,13 +1486,13 @@
    The indices are expected to be confined to coordinate values that fit the
    range of the `dest` tensor, otherwise the behavior is undefined.
 
-    The leading dimensions of the index tensor must match that of the dest 
+    The leading dimensions of the index tensor must match that of the dest
    tensor. The trailing dimensions of the dest tensor must match those of the
-    source tensor by omitting the dimensions specified in scatter_dims 
+    source tensor by omitting the dimensions specified in scatter_dims
    (rank-reducing semantics) or setting them to `1` (rank-preserving semantics)
-    (see examples). 
-    This convention allows an idiomatic specification and lowering of 
-    "scattering multiple N-D slices into the dest tensor". 
+    (see examples).
+    This convention allows an idiomatic specification and lowering of
+    "scattering multiple N-D slices into the dest tensor".
    The result type must match the type of the dest tensor.
 
    Note: in the examples below, we separate out the indexing part of the tensor
@@ -1501,7 +1501,7 @@
    Example:
 
    ```mlir
-        // For each 1x2 triple of coordinates in %indices, insert the 
+        // For each 1x2 triple of coordinates in %indices, insert the
        // element (i.e. 0-D subset) at the coordinates triple in %dest.
        //
        %out = tensor.scatter %source into %dest[%indices]
@@ -1523,19 +1523,19 @@
        // indices corresponding to the scatter_dims attribute specified by
        // %indices.
        //
-        %out = tensor.scatter %source into %dest[%indices] scatter_dims([1]) unique : 
+        %out = tensor.scatter %source into %dest[%indices] scatter_dims([1]) unique :
          (tensor<3x 4x1x6xf32>, tensor<4x5x6xf32>, tensor<3x 1xindex>)
            -> tensor<4x5x6xf32>
    ```
 
    The dimensions specified in the scatter_dims attribute are ones for which the
-    source tensor has size `1`. 
+    source tensor has size `1`.
    I.e. if the dest type is `axbxcxd` and the coordinates are [1, 3], then the
    source type suffix is `ax1xcx1`.
    Scatter also allows rank-reducing semantics where the shape `ax1xcx1` can be
    further simplified to `axc`.
 
-    The elemental type of the indices tensor can be any integer type. 
+    The elemental type of the indices tensor can be any integer type.
    In the absence of target-specific or problem-specific information the default
    type one should use is `index`.
@@ -1545,18 +1545,18 @@
    coordinates are statically guaranteed to be unique at runtime. If
    coordinates are not truly unique at runtime, the behavior is undefined.
 
-    Only full slices are meant to be supported by this op, if one desires 
+    Only full slices are meant to be supported by this op; if one desires
    partial slices (e.g. strided windows) one should compose this op with other
    tensor ops (e.g. tensor.insert_slice). This is to avoid a slippery slope of
    complexity that would make the op unusable in practice.
 
-    At the tensor-level, the index tensor is specified in an AoS form (i.e.
-    coordinate tuple is the most minor). It is the responsibility of further 
+    At the tensor-level, the index tensor is specified in an AoS form (i.e.
+    coordinate tuple is the most minor). It is the responsibility of further
    lowerings and bufferization to implement various concrete layouts.
 
    Note: As currently specified, the operation must lower to an abstraction that
    performs copies to the output tensor. This is because the buffer type system
-    is currently not rich enough to allow multiple non-contiguous views in the 
+    is currently not rich enough to allow multiple non-contiguous views in the
    same type. This is visible more clearly in a notional buffer version of the
    op:
 
@@ -1565,26 +1565,26 @@
      // random dest slices must copy to the contiguous dest.
      //
      some_side_effecting_op_writing_into %source, ...: memref<3x 4xf32>
-      memref.scatter %source into %dest[%indices] scatter_dims([1]) unique : 
+      memref.scatter %source into %dest[%indices] scatter_dims([1]) unique :
        (memref<3x 4xf32>, memref<4x5xf32>, memref<3x 1xindex>)
 
      // Nested buffer support in the producing op would allow writing directly
      // into the dest buffer.
-      %v = some_nested_buffer_view_op %dest[%indices] scatter_dims([1]) unique : 
+      %v = some_nested_buffer_view_op %dest[%indices] scatter_dims([1]) unique :
        memref<3x memref<4xf32>>
      some_side_effecting_op_writing_into %v, ...: memref<3x memref<4xf32>>
    ```
  }];
 
-  let arguments = (ins AnyRankedTensor:$source, 
-                       AnyRankedTensor:$dest, 
+  let arguments = (ins AnyRankedTensor:$source,
+                       AnyRankedTensor:$dest,
                       RankedTensorOf<[AnySignlessIntegerOrIndex]>:$indices,
                       DenseI64ArrayAttr:$scatter_dims,
                       UnitAttr:$unique);
  let results = (outs AnyRankedTensor:$result);
 
  let assemblyFormat = [{
-    $source `into` $dest `[` $indices `]` 
+    $source `into` $dest `[` $indices `]`
    `scatter_dims` `(` $scatter_dims `)`
    (`unique` $unique^)?
    attr-dict
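Scatter's shape rule is the dual of gather's. As a standalone C++ sketch of the consistency check described above (`isConsistentScatterShape` is a hypothetical helper mirroring the documented semantics, not the op's actual verifier):

```cpp
#include <cstdint>
#include <vector>

// The source shape must equal: leading dims of the index tensor (all but
// the trailing coordinate dim), followed by the dest dims with each
// scattered dim set to 1 (rank-preserving) or dropped (rank-reducing).
// Assumes scatterDims is sorted ascending.
bool isConsistentScatterShape(const std::vector<int64_t> &sourceShape,
                              const std::vector<int64_t> &destShape,
                              const std::vector<int64_t> &indicesShape,
                              const std::vector<int64_t> &scatterDims,
                              bool rankReduced) {
  std::vector<int64_t> expected(indicesShape.begin(), indicesShape.end() - 1);
  for (size_t dim = 0, s = 0; dim < destShape.size(); ++dim) {
    if (s < scatterDims.size() && scatterDims[s] == static_cast<int64_t>(dim)) {
      ++s;
      if (!rankReduced)
        expected.push_back(1); // Scattered dim kept with size 1.
    } else {
      expected.push_back(destShape[dim]); // Non-scattered dim kept as-is.
    }
  }
  return expected == sourceShape;
}

// isConsistentScatterShape({3, 4, 1, 6}, {4, 5, 6}, {3, 1}, {1},
//                          /*rankReduced=*/false) returns true, matching
// the tensor<3x 4x1x6xf32> into tensor<4x5x6xf32> example above.
```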
@@ -1673,49 +1673,49 @@
 code commonExtraClassDeclaration = [{
    int64_t getSourceRank() { return getSource().getType().getRank(); };
    int64_t getDestRank() { return getDest().getType().getRank(); };
-    RankedTensorType getSourceType() { 
+    RankedTensorType getSourceType() {
      return getSource().getType().cast<RankedTensorType>(); };
    RankedTensorType getDestType() {
      return getDest().getType().cast<RankedTensorType>(); };
 
-    /// Return position for init operand. Init operand is `dest`. 
+    /// Return position for init operand. Init operand is `dest`.
    std::pair<int64_t, int64_t> getDpsInitsPositionRange() {
      return {1, 2}; // `dest` operand
    }
 
    /// Interface method for ConditionallySpeculatable.
-    Speculation::Speculatability getSpeculatability(); 
-    
-    /// Return a mapping from positions `inner_dims_pos` to their 
+    Speculation::Speculatability getSpeculatability();
+
+    /// Return a mapping from positions `inner_dims_pos` to their
    /// tile factors.
    DenseMap<int64_t, OpFoldResult> getDimAndTileMapping();
-    
+
    /// Return the tile sizes as OpFoldResult.
    SmallVector<OpFoldResult> getMixedTiles();
-    
-    /// Return the tile sizes as `int64_t`. If a tile size is dynamic 
-    /// a sentinel `kDynamic` is introduced at that position in 
+
+    /// Return the tile sizes as `int64_t`. If a tile size is dynamic
+    /// a sentinel `kDynamic` is introduced at that position in
    /// the returned vector.
    SmallVector<int64_t> getStaticTiles();
  }];
-  
+
  let hasVerifier = 1;
}
 
def Tensor_PackOp : Tensor_RelayoutOp<"pack", [
    AttrSizedOperandSegments]> {
  let summary = "tensor pack operation";
-  let description = [{ 
+  let description = [{
    The pack operation converts an input tensor to a higher-dimensional tensor
    with a tiled and packed layout. The mandatory `inner_dims_pos` attribute
    specifies a permutation for the original dimensions, while `inner_tiles` is the
    tiling factor for each dimension. The optional attribute `outer_dims_perm`
    specifies the order for the tiled data dimension, while the attribute
    `padding_value` specifies a padding value at the boundary on non-perfectly
-    divisible dimensions. Padding is optional: 
-    - If absent, it is UB if the tile does not perfectly divide the dimension. 
-    - If present, it will pad along high dimensions (high-padding) to make the 
-      tile complete. 
+    divisible dimensions. Padding is optional:
+    - If absent, it is UB if the tile does not perfectly divide the dimension.
+    - If present, it will pad along high dimensions (high-padding) to make the
+      tile complete.
 
    Example NC_to_NCnc:
 
@@ -1752,23 +1752,31 @@
                       DenseI64ArrayAttr:$static_inner_tiles);
  let results = (outs AnyRankedTensor:$result);
  let assemblyFormat = [{
-    $source 
+    $source
    (`padding_value` `(` $padding_value^ `:` type($padding_value) `)`)?
-    (`outer_dims_perm` `=` $outer_dims_perm^)? 
+    (`outer_dims_perm` `=` $outer_dims_perm^)?
    `inner_dims_pos` `=` $inner_dims_pos
    `inner_tiles` `=`
    custom<DynamicIndexList>($inner_tiles, $static_inner_tiles)
    `into` $dest attr-dict `:` type($source) `->` type($dest)
  }];
 
+  let builders = [
+    OpBuilder<(ins "Value":$source, "Value":$dest,
+      "ArrayRef<int64_t>":$innerDimsPos,
+      "ArrayRef<OpFoldResult>":$innerTiles,
+      CArg<"Optional<Value>", "llvm::None">:$paddingValue,
+      CArg<"ArrayRef<int64_t>", "{}">:$outerDimsPerm)>
+  ];
+
  let extraClassDeclaration = commonExtraClassDeclaration # [{
    // Method to get the `ShapedType` of the result based on the inner tiles,
-    // position of the inner tiles (innerDimsPos) and interchange vector of 
+    // position of the inner tiles (innerDimsPos) and interchange vector of
    // outer loops (outerDimsPerm).
    static ShapedType inferPackedType(ShapedType sourceType,
                                      ArrayRef<int64_t> innerTileSizes,
                                      ArrayRef<int64_t> innerDimsPos,
                                      ArrayRef<int64_t> outerDimsPerm = {});
-  }]; 
+  }];
}
 
//===----------------------------------------------------------------------===//
@@ -1795,7 +1803,7 @@
    Example CK to KCck:
 
    ```mlir
-    tensor.unapck %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] 
+    tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1]
      inner_tiles = [8, 32] into %dest : tensor<8x16x8x32xf32> -> tensor<128x256xf32>
    ```
  }];
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3238,6 +3238,26 @@
   setNameFn(getResult(), "pack");
 }
 
+void PackOp::build(OpBuilder &builder, OperationState &state, Value source,
+                   Value dest, ArrayRef<int64_t> innerDimsPos,
+                   ArrayRef<OpFoldResult> innerTiles,
+                   Optional<Value> paddingValue,
+                   ArrayRef<int64_t> outerDimsPerm) {
+  assert(innerDimsPos.size() == innerTiles.size() &&
+         "number of tile sizes specified must match the specified number of "
+         "original dimensions to be tiled");
+  SmallVector<int64_t> staticTileSizes;
+  SmallVector<Value> dynamicTileSizes;
+  dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes,
+                             ShapedType::kDynamic);
+  build(builder, state, dest.getType(), source, dest,
+        paddingValue ? paddingValue.value() : nullptr,
+        outerDimsPerm.empty() ? nullptr
+                              : builder.getDenseI64ArrayAttr(outerDimsPerm),
+        builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes,
+        builder.getDenseI64ArrayAttr(staticTileSizes));
+}
+
 LogicalResult
 PackOp::reifyResultShapes(OpBuilder &builder,
                           ReifiedRankedShapedTypeDims &reifiedReturnShapes) {
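For reference, a minimal sketch of how the new `PackOp` builder might be called, e.g. from a rewrite pattern. The surrounding context is assumed: `rewriter`, `loc`, `source`, and `dest` are not defined by this patch, and the tile sizes follow the CK to KCck example above.

```cpp
// Pack with 8x32 inner tiles on dims [0, 1], interchanging the outer tile
// loops via outer_dims_perm = [1, 0]. No padding value is supplied, so it
// is UB if the tiles do not evenly divide the source dimensions.
SmallVector<OpFoldResult> innerTiles = {rewriter.getIndexAttr(8),
                                        rewriter.getIndexAttr(32)};
Value packed = rewriter.create<tensor::PackOp>(
    loc, source, dest,
    /*innerDimsPos=*/ArrayRef<int64_t>{0, 1}, innerTiles,
    /*paddingValue=*/llvm::None,
    /*outerDimsPerm=*/ArrayRef<int64_t>{1, 0});
```

Passing the tile sizes as `OpFoldResult` lets callers mix static sizes (as attributes, shown here) with SSA values; `dispatchIndexOpFoldResults` in the builder splits them into `dynamicTileSizes` operands and the `static_inner_tiles` attribute with `ShapedType::kDynamic` sentinels.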