diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3,7 +3,7 @@
   name: matmul
   cpp_op_name: MatmulOp
   doc: |-
-    Performs a matrix multiplacation of two 2D inputs.
+    Performs a matrix multiplication of two 2D inputs.
 
     Numeric casting is performed on the operands to the inner multiply, promoting
     them to the same data type as the accumulator/output.
@@ -305,4 +305,101 @@
                 operands:
                 - !ScalarExpression
                   scalar_arg: B
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: mmt_4d_kernel
+  cpp_op_name: Mmt4DKernelOp
+  doc: |-
+    A lowering path for linalg.matmul towards efficient code generation on CPU.
+    In general, MLIR tensors don't have a memory layout in themselves, only
+    acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+    operation operating on abstract layout-less tensors. This op is also
+    operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+    layout to be used for bufferizing its input tensors, which will be the
+    actual data layout traversed by an efficient CPU kernel. Accordingly, this
+    operator has some parameters controlling this layout, allowing it to adapt
+    to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+    in the shapes of the input tensors. These values will be chosen based on CPU
+    ISA features such as the shape of SIMD instructions and of SIMD registers.
+    Here is a more detailed overview of differences from linalg.matmul:
+    * The right hand side is transposed, whence the 't' in 'mmt'. In other
+      words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+      `accumulator + lhs * rhs`. This transposition puts RHS on an equal
+      footing with LHS from the perspective of an efficient implementation: now
+      both are traversed row-wise by the inner accumulation loop, so we want the
+      same row-major layouts for both LHS and RHS. Without that transposition,
+      the below discussion of layouts would be complicated by having to describe
+      LHS and RHS separately, and the actual code would be accordingly more
+      complicated.
+    * The input and output tensors have a 4D shape instead of a 2D shape. They
+      are interpreted as 2D matrices with one level of 2D tile subdivision,
+      whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+      '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+      as: MxK tiles, each of shape M0xK0.
+    * This op comes with a *recommendation* that its input tensors be bufferized
+      into a row-major layout (meaning that the last-enumerated dimension is
+      contiguous in memory), and with no inner striding (meaning no striding
+      except possibly in the outermost dimension). Because of the 4D shape
+      encoding a level of 2D tile subdivision as described above, this row-major
+      layout of the 4D tensor effectively means a tiled 2D layout, with one
+      level of tiling.
+  implements:
+  - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !<LinalgTensorDef>
+    name: lhs
+    usage: input
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s4, s2, s5)>
+    element_type_var: LhsType
+  - !<LinalgTensorDef>
+    name: rhs
+    usage: input
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s1, s4, s3, s5)>
+    element_type_var: RhsType
+  - !<LinalgTensorDef>
+    name: accum
+    usage: output
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2, s3)>
+    element_type_var: AccumType
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d4, d2,
+      d5)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d1, d4, d3,
+      d5)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2,
+      d3)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: accum
+    value: !ScalarExpression
+      scalar_apply:
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: accum
+        - !ScalarExpression
+          scalar_apply:
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              symbolic_cast:
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: lhs
+            - !ScalarExpression
+              symbolic_cast:
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: rhs
diff --git a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
--- a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
+++ b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
@@ -10,7 +10,7 @@
 def matmul(A=TensorDef(T1, S.M, S.K),
            B=TensorDef(T2, S.K, S.N),
            C=TensorDef(U, S.M, S.N, output=True)):
-  """Performs a matrix multiplacation of two 2D inputs.
+  """Performs a matrix multiplication of two 2D inputs.
 
   Numeric casting is performed on the operands to the inner multiply, promoting
   them to the same data type as the accumulator/output.
@@ -68,3 +68,48 @@
   """
   implements(ContractionOpInterface)
   C[None] += cast(U, A[D.m]) * cast(U, B[D.m])
+
+
+@linalg_structured_op
+def mmt_4d_kernel(lhs=TensorDef(TV.LhsType, S.M, S.K, S.M0, S.K0),
+                  rhs=TensorDef(TV.RhsType, S.N, S.K, S.N0, S.K0),
+                  accum=TensorDef(TV.AccumType, S.M, S.N, S.M0, S.N0,
+                                  output=True)):
+  """A lowering path for linalg.matmul towards efficient code generation on CPU.
+  In general, MLIR tensors don't have a memory layout in themselves, only
+  acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+  operation operating on abstract layout-less tensors. This op is also
+  operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+  layout to be used for bufferizing its input tensors, which will be the
+  actual data layout traversed by an efficient CPU kernel. Accordingly, this
+  operator has some parameters controlling this layout, allowing it to adapt
+  to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+  in the shapes of the input tensors. These values will be chosen based on CPU
+  ISA features such as the shape of SIMD instructions and of SIMD registers.
+
+  Here is a more detailed overview of differences from linalg.matmul:
+  * The right hand side is transposed, whence the 't' in 'mmt'. In other
+    words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+    `accumulator + lhs * rhs`. This transposition puts RHS on an equal
+    footing with LHS from the perspective of an efficient implementation: now
+    both are traversed row-wise by the inner accumulation loop, so we want the
+    same row-major layouts for both LHS and RHS. Without that transposition,
+    the below discussion of layouts would be complicated by having to describe
+    LHS and RHS separately, and the actual code would be accordingly more
+    complicated.
+  * The input and output tensors have a 4D shape instead of a 2D shape. They
+    are interpreted as 2D matrices with one level of 2D tile subdivision,
+    whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+    '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+    as: MxK tiles, each of shape M0xK0.
+  * This op comes with a *recommendation* that its input tensors be bufferized
+    into a row-major layout (meaning that the last-enumerated dimension is
+    contiguous in memory), and with no inner striding (meaning no striding
+    except possibly in the outermost dimension). Because of the 4D shape
+    encoding a level of 2D tile subdivision as described above, this row-major
+    layout of the 4D tensor effectively means a tiled 2D layout, with one
+    level of tiling.
+
+  """
+  implements(ContractionOpInterface)
+  accum[D.m, D.n, D.m0, D.n0] += cast(TV.AccumType, lhs[D.m, D.k, D.m0, D.k0]) * cast(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])
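As a reading aid, and not part of the patch itself: the contraction spelled out by the indexing maps and the scalar assignment above can be written as a plain NumPy sketch. The function name mmt_4d_kernel_reference and the use of numpy.einsum are illustrative choices made here, not anything provided by OpDSL; the astype calls model the cast(TV.AccumType, ...) promotion in the op body.

import numpy as np


def mmt_4d_kernel_reference(lhs, rhs, accum):
  # Shapes: lhs is (M, K, M0, K0), rhs is (N, K, N0, K0) (already transposed,
  # the 't' in 'mmt'), accum is (M, N, M0, N0).
  # (m, n, m0, n0) are the parallel dimensions and (k, k0) the reductions,
  # matching iterator_types in the patch:
  #   accum[m, n, m0, n0] += sum over k, k0 of lhs[m, k, m0, k0] * rhs[n, k, n0, k0]
  return accum + np.einsum('mkab,nkcb->mnac',
                           lhs.astype(accum.dtype),
                           rhs.astype(accum.dtype))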
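The docstring's two structural claims, that the op computes `accumulator + lhs * transpose(rhs)` and that a row-major 4D buffer amounts to a tiled 2D layout, can be sanity-checked against an ordinary 2D matmul. The snippet below continues the previous sketch and reuses mmt_4d_kernel_reference; the packing helper and the concrete M0/K0/N0 values are hypothetical stand-ins for the ISA-driven choices the docstring mentions.

def pack_2d_to_4d_tiles(matrix, tile_rows, tile_cols):
  # (R*tile_rows, C*tile_cols) -> (R, C, tile_rows, tile_cols). In C (row-major)
  # order the last two dimensions of the result are contiguous, i.e. each tile
  # is stored contiguously: the "tiled 2D layout" recommended at bufferization.
  rows, cols = matrix.shape
  return (matrix.reshape(rows // tile_rows, tile_rows, cols // tile_cols, tile_cols)
                .transpose(0, 2, 1, 3)
                .copy())  # materialize the row-major 4D layout

M, K, N, M0, K0, N0 = 8, 12, 6, 4, 3, 2   # hypothetical outer and tile sizes
a = np.random.rand(M * M0, K * K0).astype(np.float32)
b = np.random.rand(K * K0, N * N0).astype(np.float32)

lhs = pack_2d_to_4d_tiles(a, M0, K0)      # (M, K, M0, K0)
rhs = pack_2d_to_4d_tiles(b.T, N0, K0)    # (N, K, N0, K0): note the transposed RHS
accum = np.zeros((M, N, M0, N0), dtype=np.float64)

result_4d = mmt_4d_kernel_reference(lhs, rhs, accum)
# Un-tiling the 4D result recovers the plain 2D product a @ b.
result_2d = result_4d.transpose(0, 2, 1, 3).reshape(M * M0, N * N0)
np.testing.assert_allclose(result_2d, a.astype(np.float64) @ b, rtol=1e-6)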