diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3,7 +3,7 @@
   name: matmul
   cpp_op_name: MatmulOp
   doc: |-
-    Performs a matrix multiplacation of two 2D inputs.
+    Performs a matrix multiplication of two 2D inputs.
 
     Numeric casting is performed on the operands to the inner multiply, promoting
     them to the same data type as the accumulator/output.
@@ -305,4 +305,101 @@
                 operands:
                 - !ScalarExpression
                   scalar_arg: B
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: mmt_4d_kernel
+  cpp_op_name: Mmt4DKernelOp
+  doc: |-
+    A lowering path for linalg.matmul towards efficient code generation on CPU.
+    In general, MLIR tensors don't have a memory layout in themselves, only
+    acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+    operation operating on abstract layout-less tensors. This op is also
+    operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+    layout to be used for bufferizing its input tensors, which will be the
+    actual data layout traversed by an efficient CPU kernel. Accordingly, this
+    operator has some parameters controlling this layout, allowing it to adapt
+    to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+    in the shapes of the input tensors. These values will be chosen based on CPU
+    ISA features such as the shape of SIMD instructions and of SIMD registers.
+    Here is a more detailed overview of differences from linalg.matmul:
+    * The right hand side is transposed, whence the 't' in 'mmt'. In other
+      words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+      `accumulator + lhs * rhs`. This transposition puts RHS on an equal
+      footing with LHS from the perspective of an efficient implementation: now
+      both are traversed row-wise by the inner accumulation loop, so we want the
+      same row-major layouts for both LHS and RHS. Without that transposition,
+      the below discussion of layouts would be complicated by having to describe
+      LHS and RHS separately, and the actual code would be accordingly more
+      complicated.
+    * The input and output tensors have a 4D shape instead of a 2D shape. They
+      are interpreted as 2D matrices with one level of 2D tile subdivision,
+      whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+      '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+      as: MxK tiles, each of shape M0xK0.
+    * This op comes with a *recommendation* that its input tensors be bufferized
+      into a row-major layout (meaning that the last-enumerated dimension is
+      contiguous in memory), and with no inner striding (meaning no striding
+      except possibly in the outermost dimension). Because of the 4D shape
+      encoding a level of 2D tile subdivision as described above, this row-major
+      layout of the 4D tensor effectively means a tiled 2D layout, with one
+      level of tiling.
+  implements:
+  - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !<LinalgTensorDef>
+    name: lhs
+    usage: input
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s4, s2, s5)>
+    element_type_var: LhsType
+  - !<LinalgTensorDef>
+    name: rhs
+    usage: input
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s1, s4, s3, s5)>
+    element_type_var: RhsType
+  - !<LinalgTensorDef>
+    name: accum
+    usage: output
+    shape: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2, s3)>
+    element_type_var: AccumType
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d4, d2,
+      d5)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d1, d4, d3,
+      d5)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2,
+      d3)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: accum
+    value: !ScalarExpression
+      scalar_apply:
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: accum
+        - !ScalarExpression
+          scalar_apply:
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              symbolic_cast:
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: lhs
+            - !ScalarExpression
+              symbolic_cast:
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: rhs
diff --git a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
--- a/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
+++ b/mlir/lib/Bindings/Python/mlir/tools/linalg_opdsl/ops/core_named_ops.py
@@ -10,7 +10,7 @@
 def matmul(A=TensorDef(T1, S.M, S.K),
            B=TensorDef(T2, S.K, S.N),
            C=TensorDef(U, S.M, S.N, output=True)):
-  """Performs a matrix multiplacation of two 2D inputs.
+  """Performs a matrix multiplication of two 2D inputs.
 
   Numeric casting is performed on the operands to the inner multiply, promoting
   them to the same data type as the accumulator/output.
@@ -68,3 +68,48 @@
   """
   implements(ContractionOpInterface)
   C[None] += cast(U, A[D.m]) * cast(U, B[D.m])
+
+
+@linalg_structured_op
+def mmt_4d_kernel(lhs=TensorDef(TV.LhsType, S.M, S.K, S.M0, S.K0),
+                  rhs=TensorDef(TV.RhsType, S.N, S.K, S.N0, S.K0),
+                  accum=TensorDef(TV.AccumType, S.M, S.N, S.M0, S.N0,
+                                  output=True)):
+  """A lowering path for linalg.matmul towards efficient code generation on CPU.
+  In general, MLIR tensors don't have a memory layout in themselves, only
+  acquiring one at bufferization. Accordingly, linalg.matmul is a high-level
+  operation operating on abstract layout-less tensors. This op is also
+  operating on tensors, but unlike linalg.matmul, it is prescribing a specific
+  layout to be used for bufferizing its input tensors, which will be the
+  actual data layout traversed by an efficient CPU kernel. Accordingly, this
+  operator has some parameters controlling this layout, allowing it to adapt
+  to CPU ISA characteristics: these are the `M0`, `K0`, `N0` values appearing
+  in the shapes of the input tensors. These values will be chosen based on CPU
+  ISA features such as the shape of SIMD instructions and of SIMD registers.
+
+  Here is a more detailed overview of differences from linalg.matmul:
+  * The right hand side is transposed, whence the 't' in 'mmt'. In other
+    words, this op computes `accumulator + lhs * transpose(rhs)` instead of
+    `accumulator + lhs * rhs`. This transposition puts RHS on an equal
+    footing with LHS from the perspective of an efficient implementation: now
+    both are traversed row-wise by the inner accumulation loop, so we want the
+    same row-major layouts for both LHS and RHS. Without that transposition,
+    the below discussion of layouts would be complicated by having to describe
+    LHS and RHS separately, and the actual code would be accordingly more
+    complicated.
+  * The input and output tensors have a 4D shape instead of a 2D shape. They
+    are interpreted as 2D matrices with one level of 2D tile subdivision,
+    whence the 2+2=4 dimensions. The inner tile dimensions are identified with
+    '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
+    as: MxK tiles, each of shape M0xK0.
+  * This op comes with a *recommendation* that its input tensors be bufferized
+    into a row-major layout (meaning that the last-enumerated dimension is
+    contiguous in memory), and with no inner striding (meaning no striding
+    except possibly in the outermost dimension). Because of the 4D shape
+    encoding a level of 2D tile subdivision as described above, this row-major
+    layout of the 4D tensor effectively means a tiled 2D layout, with one
+    level of tiling.
+
+  """
+  implements(ContractionOpInterface)
+  accum[D.m, D.n, D.m0, D.n0] += cast(TV.AccumType, lhs[D.m, D.k, D.m0, D.k0]) * cast(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])
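As a reading aid, and not part of the patch itself: the contraction spelled out by the indexing maps and the scalar assignment above can be written as a plain NumPy sketch. The function name mmt_4d_kernel_reference and the use of numpy.einsum are illustrative choices made here, not anything provided by OpDSL; the astype calls model the cast(TV.AccumType, ...) promotion in the op body.

import numpy as np


def mmt_4d_kernel_reference(lhs, rhs, accum):
  # Shapes: lhs is (M, K, M0, K0), rhs is (N, K, N0, K0) (already transposed,
  # the 't' in 'mmt'), accum is (M, N, M0, N0).
  # (m, n, m0, n0) are the parallel dimensions and (k, k0) the reductions,
  # matching iterator_types in the patch:
  #   accum[m, n, m0, n0] += sum over k, k0 of lhs[m, k, m0, k0] * rhs[n, k, n0, k0]
  return accum + np.einsum('mkab,nkcb->mnac',
                           lhs.astype(accum.dtype),
                           rhs.astype(accum.dtype))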
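The docstring's two structural claims, that the op computes `accumulator + lhs * transpose(rhs)` and that a row-major 4D buffer amounts to a tiled 2D layout, can be sanity-checked against an ordinary 2D matmul. The snippet below continues the previous sketch and reuses mmt_4d_kernel_reference; the packing helper and the concrete M0/K0/N0 values are hypothetical stand-ins for the ISA-driven choices the docstring mentions.

def pack_2d_to_4d_tiles(matrix, tile_rows, tile_cols):
  # (R*tile_rows, C*tile_cols) -> (R, C, tile_rows, tile_cols). In C (row-major)
  # order the last two dimensions of the result are contiguous, i.e. each tile
  # is stored contiguously: the "tiled 2D layout" recommended at bufferization.
  rows, cols = matrix.shape
  return (matrix.reshape(rows // tile_rows, tile_rows, cols // tile_cols, tile_cols)
                .transpose(0, 2, 1, 3)
                .copy())  # materialize the row-major 4D layout

M, K, N, M0, K0, N0 = 8, 12, 6, 4, 3, 2   # hypothetical outer and tile sizes
a = np.random.rand(M * M0, K * K0).astype(np.float32)
b = np.random.rand(K * K0, N * N0).astype(np.float32)

lhs = pack_2d_to_4d_tiles(a, M0, K0)      # (M, K, M0, K0)
rhs = pack_2d_to_4d_tiles(b.T, N0, K0)    # (N, K, N0, K0): note the transposed RHS
accum = np.zeros((M, N, M0, N0), dtype=np.float64)

result_4d = mmt_4d_kernel_reference(lhs, rhs, accum)
# Un-tiling the 4D result recovers the plain 2D product a @ b.
result_2d = result_4d.transpose(0, 2, 1, 3).reshape(M * M0, N * N0)
np.testing.assert_allclose(result_2d, a.astype(np.float64) @ b, rtol=1e-6)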