diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -406,6 +406,89 @@
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// GatherOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_GatherOp : Tensor_Op<"gather", [
+    NoSideEffect
+  ]> {
+  let summary = "gather a subset of a tensor at specified indices.";
+  let description = [{
+    The `gather` operation extracts a subset of the elements from an input
+    tensor at the given indices.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to extract (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the input tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, extract the
+    // element (i.e. 0-D subset) at the coordinates triple in %input.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    %out = gather %input[%indices] :
+      tensor<4x4x4xf32>[tensor<1x2x3xindex>] -> tensor<1x2xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole slices of the
+    input tensor.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, extract the 2-D
+    // slice [:, 1, :] at the coordinates singleton in %input.
+    //
+    %out = gather %input[%indices] coordinates = [1] :
+      tensor<4x4x4xf32>[tensor<5x6x1xindex>] -> tensor<5x6x4x4xf32>
+    ```
+
+    An optional `unique` unit attribute may be specified to indicate that the
+    coordinates are statically guaranteed to be unique at runtime.
+    Incorrectly setting the `unique` attribute when the coordinates are not
+    truly unique is undefined behavior.
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
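+
+    For example, a partial window of each gathered slice can be recovered by
+    composing with `tensor.extract_slice` (illustrative sketch only; the
+    shapes below are arbitrary):
+
+    ```
+    // Gather the full [:, 1, :] slices, then take a 2x2 window of each
+    // gathered slice with tensor.extract_slice.
+    %full = gather %input[%indices] coordinates = [1] :
+      tensor<4x4x4xf32>[tensor<6x1xindex>] -> tensor<6x4x4xf32>
+    %partial = tensor.extract_slice %full[0, 0, 0] [6, 2, 2] [1, 1, 1] :
+      tensor<6x4x4xf32> to tensor<6x2x2xf32>
+    ```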
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    Note: As currently specified, the operation must lower to an abstraction
+    that performs copies to the output tensor. This is because the buffer
+    type system is currently not rich enough to allow multiple non-contiguous
+    views in the same type. This is visible more clearly in a notional buffer
+    version of the op:
+
+    ```
+    // memref<?x4xf32> is a contiguous buffer of ?x4 elements: gathering from
+    // random slices of %input must copy into this contiguous output.
+    %out = gather %input[%indices] coordinates = [1] :
+      memref<4x4xf32>[memref<?x1xindex>] -> memref<?x4xf32>
+
+    // Nested buffer support would allow gather to view into the input data.
+    %out = gather %input[%indices] coordinates = [1] :
+      memref<4x4xf32>[memref<?x1xindex>] -> memref<?xmemref<4xf32>>
+    ```
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GenerateOp
 //===----------------------------------------------------------------------===//
@@ -1211,6 +1294,189 @@
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// ParallelScatterOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_ParallelScatterOp : Tensor_Op<"parallel_scatter", []> {
+  let summary = [{
+    Specify the tensor scatter update of a single thread of a parent
+    ParallelCombiningOpInterface op, at the given indices.
+  }];
+  let description = [{
+    The `parallel_scatter` operation yields a subset tensor value to its
+    parent ParallelCombiningOpInterface op. These subset tensor values are
+    aggregated, in some unspecified order, into the full tensor value
+    returned by the parent parallel iterating op. The `parallel_scatter` op
+    is one such op allowed inside a ParallelCombiningOpInterface op.
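+
+    Example (illustrative only: `parallel_iterate` and its `combining`
+    terminator region stand for a notional parent op implementing
+    ParallelCombiningOpInterface, in the style of `scf.foreach_thread`; they
+    are not ops defined here):
+
+    ```
+    // Each thread writes its own slices of %dest; the parent op aggregates
+    // the per-thread updates into the single returned tensor.
+    %res = parallel_iterate (%thread_id) shared(%dest) -> tensor<4x4x4xf32> {
+      ...
+      combining {
+        parallel_scatter %thread_slices into %dest[%thread_indices]
+          coordinates = [1] :
+          tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+          -> tensor<4x4x4xf32>
+      }
+    }
+    ```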
+
+    Conflicting scatter indices result in undefined semantics: an index
+    written to by multiple parallel updates might contain data from any of
+    the updates, or even a malformed bit pattern.
+
+    In the future, a combinator region or symbol may be added to allow atomic
+    RMW semantics.
+
+    If an index is updated exactly once, the value contained at that index in
+    the resulting tensor will be equal to the value at the corresponding
+    index of the slice that was used for the update. If an index is not
+    updated at all, its value will be equal to the one in the original
+    tensor.
+
+    This op does not create a new value, which allows maintaining a clean
+    separation between the subset and full tensor.
+
+    Note that we cannot mark this operation as pure (NoSideEffect), even
+    though it has no side effects, because it would otherwise get DCEd during
+    canonicalization.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to insert (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the dest tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, insert the
+    // element (i.e. 0-D subset) at the coordinates triple in %dest.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    parallel_scatter %input into %dest[%indices] :
+      tensor<1x2xf32> into tensor<4x4x4xf32>[tensor<1x2x3xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole tensor slices.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, insert the 2-D
+    // slice [:, 1, :] at the coordinates singleton into %dest.
+    //
+    parallel_scatter %input into %dest[%indices] coordinates = [1] :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    After buffer allocation, the `parallel_scatter` op is expected to lower
+    into a yet-undefined buffer view op that supports nested buffer types
+    (see the discussion about copy vs. view in the documentation of GatherOp
+    and ScatterOp: the copy behavior is deemed prohibitive for the
+    `parallel_scatter` op).
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$dest,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `into` $dest `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `into` type($dest) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ScatterOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_ScatterOp : Tensor_Op<"scatter", [
+    NoSideEffect
+  ]> {
+  let summary = "scatter a subset of a tensor at specified indices.";
+  let description = [{
+    The `scatter` operation inserts a subset of the elements from an input
+    tensor into a destination tensor at the given indices.
+
+    In its most general form, the tensor of indices specifies all the
+    coordinates of every element to insert (i.e. COO format). The indices
+    are expected to be confined to coordinate values that fit the range of
+    the dest tensor, otherwise the behavior is undefined.
+
+    Example:
+
+    ```
+    // For each 1x2 triple of coordinates in %indices, insert the
+    // element (i.e. 0-D subset) at the coordinates triple in %dest.
+    // This corresponds to implicit COO coordinates = [0, 1, 2].
+    //
+    %out = scatter %input into %dest[%indices] :
+      tensor<1x2xf32> into tensor<4x4x4xf32>[tensor<1x2x3xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    A slice variant is provided that allows specifying whole tensor slices.
+
+    Example:
+
+    ```
+    // For each 5x6 singleton of coordinates in %indices, insert the 2-D
+    // slice [:, 1, :] at the coordinates singleton into %dest.
+    //
+    %out = scatter %input into %dest[%indices] coordinates = [1] :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```
+
+    An optional `unique` unit attribute may be specified to indicate that the
+    coordinates are statically guaranteed to be unique at runtime.
+    Incorrectly setting the `unique` attribute when the coordinates are not
+    truly unique is undefined behavior.
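+
+    Example (illustrative, showing the optional `unique` keyword of the
+    custom assembly sketched below; the coordinates in %indices are assumed
+    to be pairwise distinct at runtime):
+
+    ```
+    // The `unique` attribute asserts that no two coordinate singletons in
+    // %indices are equal, so no two slices are written to the same location.
+    %out = scatter %input into %dest[%indices] coordinates = [1] unique :
+      tensor<5x6x4x4xf32> into tensor<4x4x4xf32>[tensor<5x6x1xindex>]
+      -> tensor<4x4x4xf32>
+    ```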
+
+    Only full slices are meant to be supported by this op; if one desires
+    partial slices, one should compose this op with other tensor ops. This is
+    to avoid a slippery slope of complexity that would make the op unusable
+    in practice.
+
+    At the tensor level, the index tensor is specified in an AoS form (i.e.
+    the coordinate tuple is the most minor dimension). It is the
+    responsibility of further lowerings and bufferization to implement
+    various concrete layouts.
+
+    Note: As currently specified, the operation must lower to an abstraction
+    that performs copies to the output tensor. This is because the buffer
+    type system is currently not rich enough to allow multiple non-contiguous
+    views in the same type. This is visible more clearly in a notional buffer
+    version of the op:
+
+    ```
+    // memref<?x4xf32> is a contiguous buffer of ?x4 elements: scattering
+    // into random dest slices requires copying from this contiguous buffer
+    // into %dest.
+    some_side_effecting_op_writing_into %input, ...: memref<?x4xf32>
+    scatter %input into %dest[%indices] coordinates = [1] :
+      memref<?x4xf32> into memref<4x4xf32>[memref<?x1xindex>]
+
+    // Nested buffer support in the producing op would allow writing directly
+    // into the dest buffer.
+    %v = some_nested_buffer_view_op %dest[%indices] coordinates = [1] :
+      memref<?xmemref<4xf32>>
+    some_side_effecting_op_writing_into %v, ...: memref<?xmemref<4xf32>>
+    ```
+  }];
+
+  let arguments = (ins AnyRankedTensor:$input,
+                       AnyRankedTensor:$dest,
+                       AnyRankedTensor:$indices,
+                       OptionalAttr<I64ArrayAttr>:$coordinates,
+                       UnitAttr:$unique);
+  let results = (outs AnyRankedTensor:$result);
+
+  let assemblyFormat = [{
+    $input `into` $dest `[` $indices `]`
+      (`coordinates` `=` $coordinates^)?
+      (`unique` $unique^)?
+      attr-dict
+    `:` type($input) `into` type($dest) `[` type($indices) `]` `->` type($result)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // SplatOp
 //===----------------------------------------------------------------------===//