diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -1133,6 +1133,81 @@
                 - !ScalarExpression
                   scalar_arg: rhs
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: batch_mmt4d
+  cpp_class_name: BatchMmt4DOp
+  doc: "Performs a batched matrix-matrix-transpose multiplication of two\nbatched-4D\
+    \ (5D) inputs.\n\nBesides the outermost batch dimension has the same semantic\
+    \ as\nlinalg.batch_matmul, the differences from linalg.batch_matmul in the\nnon-batch\
+    \ dimensions are the same as linalg.mmt4d vs. linalg.matmul. See the\ndescription\
+    \ of linalg.mmt4d."
+  implements:
+  - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: lhs
+    kind: input_tensor
+    type_var: LhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s2, s3, s4)>
+  - !LinalgOperandDefConfig
+    name: rhs
+    kind: input_tensor
+    type_var: RhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s5, s2, s6, s4)>
+  - !LinalgOperandDefConfig
+    name: accum
+    kind: output_tensor
+    type_var: AccumType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s5, s3, s6)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d3, d4, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d2, d3, d5, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d2, d4, d5)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: accum
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: accum
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: lhs
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: rhs
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: batch_matmul
   cpp_class_name: BatchMatmulOp
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -350,6 +350,27 @@
     ) * TypeFn.cast_signed(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])
 
 
+@linalg_structured_op
+def batch_mmt4d(
+    lhs=TensorDef(TV.LhsType, Batch, S.M, S.K, S.M0, S.K0),
+    rhs=TensorDef(TV.RhsType, Batch, S.N, S.K, S.N0, S.K0),
+    accum=TensorDef(TV.AccumType, Batch, S.M, S.N, S.M0, S.N0, output=True),
+):
+    """Performs a batched matrix-matrix-transpose multiplication of two
+    batched-4D (5D) inputs.
+
+    Besides the outermost batch dimension has the same semantic as
+    linalg.batch_matmul, the differences from linalg.batch_matmul in the
+    non-batch dimensions are the same as linalg.mmt4d vs. linalg.matmul. See the
+    description of linalg.mmt4d.
+    """
+    domain(D.b, D.m, D.n, D.k, D.m0, D.n0, D.k0)  # D.k/D.k0: reduction dims
+    implements(ContractionOpInterface)
+    accum[D.b, D.m, D.n, D.m0, D.n0] += TypeFn.cast_signed(
+        TV.AccumType, lhs[D.b, D.m, D.k, D.m0, D.k0]
+    ) * TypeFn.cast_signed(TV.AccumType, rhs[D.b, D.n, D.k, D.n0, D.k0])
+
+
 @linalg_structured_op
 def batch_matmul(
     A=TensorDef(T1, Batch, S.M, S.K),
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1187,6 +1187,17 @@
 
 // -----
 
+// CHECK-LABEL: func @batch_mmt4d
+func.func @batch_mmt4d(%lhs: tensor<128x10x32x8x1xf32>, %rhs: tensor<128x80x32x4x1xf32>, %acc: tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> {
+  // CHECK: %{{.+}} = linalg.batch_mmt4d
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>)
+  // CHECK-SAME: outs(%{{.+}} : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+  %result = linalg.batch_mmt4d ins(%lhs, %rhs : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%acc : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+  return %result : tensor<128x10x80x8x4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @add_dynamic
 func.func @add_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
   // CHECK: linalg.add