diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -1137,6 +1137,81 @@
                 - !ScalarExpression
                   scalar_arg: rhs
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: batch_mmt4d
+  cpp_class_name: BatchMmt4DOp
+  doc: "Performs a batched matrix-matrix-transpose multiplication of two\nbatched-4D\
+    \ (5D) inputs.\n\nThe outermost batch dimension has the same semantics as in\n\
+    linalg.batch_matmul; in the non-batch dimensions, this op differs from\nlinalg.batch_matmul\
+    \ in the same way that linalg.mmt4d differs from\nlinalg.matmul. See the description\
+    \ of linalg.mmt4d."
+  implements:
+  - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: lhs
+    kind: input_tensor
+    type_var: LhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s2, s3, s4)>
+  - !LinalgOperandDefConfig
+    name: rhs
+    kind: input_tensor
+    type_var: RhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s5, s2, s6, s4)>
+  - !LinalgOperandDefConfig
+    name: accum
+    kind: output_tensor
+    type_var: AccumType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s5, s3, s6)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d3, d4, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d2, d3, d5, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d2, d4, d5)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: accum
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: accum
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: lhs
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: rhs
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: batch_matmul
   cpp_class_name: BatchMatmulOp
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -350,6 +350,27 @@
     ) * TypeFn.cast_signed(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])


+@linalg_structured_op
+def batch_mmt4d(
+    lhs=TensorDef(TV.LhsType, Batch, S.M, S.K, S.M0, S.K0),
+    rhs=TensorDef(TV.RhsType, Batch, S.N, S.K, S.N0, S.K0),
+    accum=TensorDef(TV.AccumType, Batch, S.M, S.N, S.M0, S.N0, output=True),
+):
+    """Performs a batched matrix-matrix-transpose multiplication of two
+    batched-4D (5D) inputs.
+
+    The outermost batch dimension has the same semantics as in
+    linalg.batch_matmul; in the non-batch dimensions, this op differs from
+    linalg.batch_matmul in the same way that linalg.mmt4d differs from
+    linalg.matmul. See the description of linalg.mmt4d.
+ """ + domain(D.b, D.m, D.n, D.k, D.m0, D.n0, D.k0) + implements(ContractionOpInterface) + accum[D.b, D.m, D.n, D.m0, D.n0] += TypeFn.cast_signed( + TV.AccumType, lhs[D.b, D.m, D.k, D.m0, D.k0] + ) * TypeFn.cast_signed(TV.AccumType, rhs[D.b, D.n, D.k, D.n0, D.k0]) + + @linalg_structured_op def batch_matmul( A=TensorDef(T1, Batch, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1187,6 +1187,17 @@ // ----- +// CHECK-LABEL: func @batch_mmt4d +func.func @batch_mmt4d(%arg0: tensor<128x10x32x8x1xf32>, %arg1: tensor<128x80x32x4x1xf32>, %arg2: tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> { + // CHECK: %{{.+}} = linalg.batch_mmt4d + // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) + // CHECK-SAME: outs(%{{.+}} : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> + %0 = linalg.batch_mmt4d ins(%arg0, %arg1 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%arg2 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> + return %0: tensor<128x10x80x8x4xf32> +} + +// ----- + // CHECK-LABEL: func @add_dynamic func.func @add_dynamic(%arg0: memref, %arg1: memref, %arg2: memref) { // CHECK: linalg.add