diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3,7 +3,7 @@
name: matmul
cpp_op_name: MatmulOp
doc: |-
- Performs a matrix multiplacation of two 2D inputs.
+ Performs a matrix multiplication of two 2D inputs.
Numeric casting is performed on the operands to the inner multiply, promoting
them to the same data type as the accumulator/output.
@@ -305,4 +305,101 @@
operands:
- !ScalarExpression
scalar_arg: B
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: mmt_4d_kernel
+ cpp_op_name: Mmt4DKernelOp
+ doc: |-
+ A lowering path for linalg.matmul towards efficient code generation on CPU.
+ In general, MLIR tensors don't have a memory layout in themselves, only
+ acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+ operation operating on abstract layout-less tensors. This op is also
+ operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+ layout to be used for bufferizing its input tensors, which will be the
+ actual data layout traversed by an efficient CPU kernel. Accordingly, this
+ operator has some parameters controlling this layout, allowing it to adapt
+ to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+ in the shapes of the input tensors. These values will be chosen based on CPU
+ ISA features such as the shape of SIMD instructions and of SIMD registers.
+ Here is a more detailed overview of differences from linalg.matmul:
+ * The right hand side is transposed, whence the 't' in 'mmt'. In other
+ words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+ `accumulator + lhs * rhs`. This transposition brings RHS on an equal
+ footing as LHS from the perspective of an efficient implementation: now
+ both are traversed row-wise by the inner accumulation loop, so we want the
+ same row-major layouts for both LHS and RHS. Without that transposition,
+ the below discussion of layouts would be complicated by having to describe
+ LHS and RHS separately, and the actual code would be accordingly more
+ complicated.
+ * The input and output tensors have a 4D shape instead of a 2D shape. They
+ are interpreted as 2D matrices with one level of 2D tile subdivision,
+ whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+ '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+ as: MxK tiles, each of shape M0xK0.
+ * This op comes with a *recommendation* that its input tensors be bufferized
+ into a row-major layout (meaning that the last-enumerated dimension is
+ contiguous in memory), and with no inner striding (meaning no striding
+ except possibly in the outermost dimension). Because of the 4D shape
+ encoding a level of 2D tile subdivision as described above, this row-major
+ layout of the 4D tensor effectively means a tiled 2D layout, with one
+ level of tiling.
+ implements:
+ - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+ args:
+  - !LinalgOperandDefConfig
+ name: lhs
+ usage: input
+ shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s4, s2, s5)>
+ element_type_var: LhsType
+  - !LinalgOperandDefConfig
+ name: rhs
+ usage: input
+ shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s1, s4, s3, s5)>
+ element_type_var: RhsType
+  - !LinalgOperandDefConfig
+ name: accum
+ usage: output
+ shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2, s3)>
+ element_type_var: AccumType
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d4, d2,
+ d5)>
+ - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d1, d4, d3,
+ d5)>
+ - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2,
+ d3)>
+ iterator_types:
+ - parallel
+ - parallel
+ - parallel
+ - parallel
+ - reduction
+ - reduction
+ assignments:
+ - !ScalarAssign
+ arg: accum
+ value: !ScalarExpression
+ scalar_apply:
+ fn_name: add
+ operands:
+ - !ScalarExpression
+ scalar_arg: accum
+ - !ScalarExpression
+ scalar_apply:
+ fn_name: mul
+ operands:
+ - !ScalarExpression
+ symbolic_cast:
+ type_var: AccumType
+ operands:
+ - !ScalarExpression
+ scalar_arg: lhs
+ - !ScalarExpression
+ symbolic_cast:
+ type_var: AccumType
+ operands:
+ - !ScalarExpression
+ scalar_arg: rhs
diff --git a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
--- a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
+++ b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
@@ -10,7 +10,7 @@
def matmul(A=TensorDef(T1, S.M, S.K),
B=TensorDef(T2, S.K, S.N),
C=TensorDef(U, S.M, S.N, output=True)):
- """Performs a matrix multiplacation of two 2D inputs.
+ """Performs a matrix multiplication of two 2D inputs.
Numeric casting is performed on the operands to the inner multiply, promoting
them to the same data type as the accumulator/output.
@@ -68,3 +68,48 @@
"""
implements(ContractionOpInterface)
C[None] += cast(U, A[D.m]) * cast(U, B[D.m])
+
+
+@linalg_structured_op
+def mmt_4d_kernel(lhs=TensorDef(TV.LhsType, S.M, S.K, S.M0, S.K0),
+ rhs=TensorDef(TV.RhsType, S.N, S.K, S.N0, S.K0),
+ accum=TensorDef(TV.AccumType, S.M, S.N, S.M0, S.N0,
+ output=True)):
+ """A lowering path for linalg.matmul towards efficient code generation on CPU.
+ In general, MLIR tensors don't have a memory layout in themselves, only
+ acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+ operation operating on abstract layout-less tensors. This op is also
+ operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+ layout to be used for bufferizing its input tensors, which will be the
+ actual data layout traversed by an efficient CPU kernel. Accordingly, this
+ operator has some parameters controlling this layout, allowing it to adapt
+ to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+ in the shapes of the input tensors. These values will be chosen based on CPU
+ ISA features such as the shape of SIMD instructions and of SIMD registers.
+
+ Here is a more detailed overview of differences from linalg.matmul:
+ * The right hand side is transposed, whence the 't' in 'mmt'. In other
+ words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+ `accumulator + lhs * rhs`. This transposition brings RHS on an equal
+ footing as LHS from the perspective of an efficient implementation: now
+ both are traversed row-wise by the inner accumulation loop, so we want the
+ same row-major layouts for both LHS and RHS. Without that transposition,
+ the below discussion of layouts would be complicated by having to describe
+ LHS and RHS separately, and the actual code would be accordingly more
+ complicated.
+ * The input and output tensors have a 4D shape instead of a 2D shape. They
+ are interpreted as 2D matrices with one level of 2D tile subdivision,
+ whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+ '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+ as: MxK tiles, each of shape M0xK0.
+ * This op comes with a *recommendation* that its input tensors be bufferized
+ into a row-major layout (meaning that the last-enumerated dimension is
+ contiguous in memory), and with no inner striding (meaning no striding
+ except possibly in the outermost dimension). Because of the 4D shape
+ encoding a level of 2D tile subdivision as described above, this row-major
+ layout of the 4D tensor effectively means a tiled 2D layout, with one
+ level of tiling.
+
+ """
+ implements(ContractionOpInterface)
+ accum[D.m, D.n, D.m0, D.n0] += cast(TV.AccumType, lhs[D.m, D.k, D.m0, D.k0]) * cast(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])