diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3064,6 +3064,9 @@
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x1 : SME2_Matrix_ArrayVector_Single_Single_Intrinsic;
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+        def int_aarch64_sme_ # ty # instr # _ # za # _vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+        def int_aarch64_sme_ # ty # instr # _ # za # _vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
       }
     }
   }
@@ -3075,6 +3078,9 @@
   def int_aarch64_sme_usmla_za32_single_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
   def int_aarch64_sme_usmla_za32_single_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
 
+  def int_aarch64_sme_usmla_za32_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_usmla_za32_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
   // Multi-vector signed saturating doubling multiply high
 
   def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -518,8 +518,8 @@
 defm SMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"smlall", 0b0000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>;
 defm SMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlall", 0b00000, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x2>;
 defm SMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlall", 0b01000, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x4>;
-defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r>;
-defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r>;
+defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>;
+defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>;
 
 def USMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"usmlall", 0b001>;
 defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b100>;
@@ -527,8 +527,8 @@
 defm USMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"usmlall", 0b0001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>;
 defm USMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"usmlall", 0b00001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x2>;
 defm USMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"usmlall", 0b01001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x4>;
-defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r>;
-defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r>;
+defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>;
+defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>;
 
 def SMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlsll", 0b010>;
 defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 0b001>;
@@ -536,8 +536,8 @@
 defm SMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"smlsll", 0b0010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>;
 defm SMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlsll", 0b00010, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x2>;
 defm SMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlsll", 0b01010, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x4>;
-defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r>;
-defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r>;
+defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>;
+defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>;
 
 def UMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlall", 0b100>;
 defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b010>;
@@ -545,8 +545,8 @@
 defm UMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"umlall", 0b0100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>;
 defm UMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlall", 0b00100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x2>;
 defm UMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlall", 0b01100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x4>;
-defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r>;
-defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r>;
+defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>;
+defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x4>;
 
 def SUMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"sumlall", 0b101>;
 defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b110>;
@@ -560,8 +560,8 @@
 defm UMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"umlsll", 0b0110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>;
 defm UMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlsll", 0b00110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x2>;
 defm UMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlsll", 0b01110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x4>;
-defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r>;
-defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r>;
+defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>;
+defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>;
 
 defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>;
 defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>;
@@ -745,8 +745,8 @@
 defm SMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"smlall", 0b1000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>;
 defm SMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlall", 0b10000, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x2>;
 defm SMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlall", 0b11000, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x4>;
-defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b1000, MatrixOp64, ZZ_h_mul_r>;
-defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b1000, MatrixOp64, ZZZZ_h_mul_r>;
+defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b1000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>;
+defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b1000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>;
 
 def SMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlsll", 0b01>;
 defm SMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlsll", 0b01>;
@@ -754,8 +754,8 @@
 defm SMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"smlsll", 0b1010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>;
 defm SMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlsll", 0b10010, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x2>;
 defm SMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlsll", 0b11010, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x4>;
-defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b1010, MatrixOp64, ZZ_h_mul_r>;
-defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 0b1010, MatrixOp64, ZZZZ_h_mul_r>;
+defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b1010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x2>;
+defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 0b1010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>;
 
 def UMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlall", 0b10>;
 defm UMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlall", 0b10>;
@@ -763,8 +763,8 @@
 defm UMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"umlall", 0b1100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>;
 defm UMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlall", 0b10100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x2>;
 defm UMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlall", 0b11100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x4>;
-defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b1100, MatrixOp64, ZZ_h_mul_r>;
-defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b1100, MatrixOp64, ZZZZ_h_mul_r>;
+defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b1100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>;
+defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b1100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>;
 
 def UMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlsll", 0b11>;
 defm UMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlsll", 0b11>;
@@ -772,8 +772,8 @@
 defm UMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"umlsll", 0b1110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>;
 defm UMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlsll", 0b10110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x2>;
sme2_mla_ll_array_vg2_single<"umlsll", 0b10110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x2>; defm UMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlsll", 0b11110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x4>; -defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b1110, MatrixOp64, ZZ_h_mul_r>; -defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b1110, MatrixOp64, ZZZZ_h_mul_r>; +defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b1110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>; +defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b1110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>; } let Predicates = [HasSME2, HasSMEF64F64] in { diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -2817,8 +2817,13 @@ multiclass sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, - RegisterOperand vector_ty> { - def NAME : sme2_mla_ll_array_vg2_multi; + RegisterOperand vector_ty, + ValueType vt, SDPatternOperator intrinsic> { + def NAME : sme2_mla_ll_array_vg2_multi, SMEPseudo2Instr; + + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; def : InstAlias(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, vector_ty:$Zm), 0>; @@ -2856,8 +2861,13 @@ multiclass sme2_mla_ll_array_vg4_multi op, MatrixOperand matrix_ty, - RegisterOperand vector_ty> { - def NAME : sme2_mla_ll_array_vg4_multi; + RegisterOperand vector_ty, + ValueType vt, SDPatternOperator intrinsic> { + def NAME : sme2_mla_ll_array_vg4_multi, SMEPseudo2Instr; + + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; def : InstAlias(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, vector_ty:$Zm), 0>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll @@ -101,6 +101,88 @@ ret void } +; Multi x2 + +define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, %dummy, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b } +; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice, %zn0, %zn1, %zm0, %zm1) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice.4, %zn0, %zn1, %zm0, %zm1) + ret void +} + +define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov z5.d, z4.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h } +; CHECK-NEXT: ret + 
+  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; UMLALL
 
 ; Single x1
@@ -199,6 +281,88 @@
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; SMLSLL
 
 ; Single x1
@@ -297,6 +461,88 @@
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; UMLSLL
 
 ; Single x1
@@ -395,6 +641,88 @@
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ;
 ; SUMLALL
 ;
@@ -488,6 +816,49 @@
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -496,6 +867,12 @@
 declare void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -504,6 +881,12 @@
 declare void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -512,6 +895,12 @@
 declare void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -520,9 +909,18 @@
 declare void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)