diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2697,6 +2697,12 @@
   // SME2 Intrinsics
   //
 
+  class SME2_Matrix_ArrayVector_Single_Single_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>],
+                []>;
+
   class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
@@ -2725,6 +2731,13 @@
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
                 []>;
 
+  class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty,
+                 LLVMMatchType<0>, llvm_i32_ty],
+                [ImmArg<ArgIndex<3>>]>;
+
   class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
@@ -2757,4 +2770,23 @@
   def int_aarch64_sme_fmls_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
   def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
   def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+
+  //
+  // Multi-vector multiply-add/subtract long
+  //
+
+  foreach ty = ["f", "s", "u"] in {
+    foreach instr = ["mlal", "mlsl"] in {
+      def int_aarch64_sme_ # ty # instr # _single_vg2x1 : SME2_Matrix_ArrayVector_Single_Single_Intrinsic;
+      def int_aarch64_sme_ # ty # instr # _single_vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+      def int_aarch64_sme_ # ty # instr # _single_vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+      def int_aarch64_sme_ # ty # instr # _vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+      def int_aarch64_sme_ # ty # instr # _vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
+      def int_aarch64_sme_ # ty # instr # _lane_vg2x1 : SME2_Matrix_ArrayVector_Single_Index_Intrinsic;
+      def int_aarch64_sme_ # ty # instr # _lane_vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+      def int_aarch64_sme_ # ty # instr # _lane_vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+    }
+  }
 }
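A minimal IR sketch of how the intrinsics generated by the foreach above are expected to be called, taking the "f"/"mlal" iteration with nxv8f16 as the concrete overload (the function name is illustrative; the signatures follow the intrinsic classes above and the tests at the end of this patch):

  declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
  declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)

  define void @fmlal_usage(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
    ; ZA.S rows [slice, slice+1] accumulate the widened product of %zn and %zm.
    call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
    ; Indexed form: the trailing i32 selects the lane of %zm and must be an
    ; immediate, as required by ImmArg<ArgIndex<3>> above.
    call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
    ret void
  }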
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -369,9 +369,9 @@
     return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
   }
 
-  template <unsigned Scale>
+  template <unsigned MaxIdx, unsigned Scale>
   bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
-    return SelectSMETileSlice(N, Scale, Vector, Offset);
+    return SelectSMETileSlice(N, MaxIdx, Vector, Offset, Scale);
   }
 
   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -443,8 +443,8 @@
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
-  bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
-                          SDValue &Offset);
+  bool SelectSMETileSlice(SDValue N, unsigned MaxSize, SDValue &Vector,
+                          SDValue &Offset, unsigned Scale = 1);
 
   bool SelectAllActivePredicate(SDValue N);
 };
@@ -5895,8 +5895,9 @@
   return TLI->isAllActivePredicate(*CurDAG, N);
 }
 
-bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
-                                             SDValue &Base, SDValue &Offset) {
+bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
+                                             SDValue &Base, SDValue &Offset,
+                                             unsigned Scale) {
   if (N.getOpcode() != ISD::ADD) {
     Base = N;
     Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
@@ -5909,13 +5910,12 @@
   if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
     int64_t ImmOff = C->getSExtValue();
-    unsigned MaxSize = (1 << Scale) - 1;
-    if (ImmOff < 0 || ImmOff > MaxSize)
+    if ((ImmOff < 0 || ImmOff > MaxSize) || (ImmOff % Scale != 0))
       return false;
 
     Base = LHS;
-    Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
+    Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
     return true;
   }
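A worked example of the reworked offset check, assuming the vg2 tile-slice case (Scale = 2 with MaxSize = 14, i.e. the tileslicerange3s2 pattern added to SMEInstrFormats.td below): an offset of 14 passes both checks (0 <= 14 <= 14 and 14 % 2 == 0) and is encoded as 14 / 2 = 7, which prints back as the slice pair 14:15, while an odd offset such as 13 fails the ImmOff % Scale test and is left unfolded. At the IR level:

  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ; selected as: fmlal za.s[w8, 14:15], z0.h, z1.h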
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -291,77 +291,77 @@
 defm SQDMULH_VG2_2Z2Z : sme2_int_sve_destructive_vector_vg2_multi<"sqdmulh", 0b1000000>;
 defm SQDMULH_VG4_4Z4Z : sme2_int_sve_destructive_vector_vg4_multi<"sqdmulh", 0b1000000>;
 
-defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00>;
-defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00>;
-defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00>;
-defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00>;
-defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00>;
-defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00>;
-defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00>;
-defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00>;
-
-defm FMLSL_MZZI : sme2_mla_long_array_index<"fmlsl", 0b10, 0b01>;
-defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01>;
-defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01>;
-defm FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01>;
-defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01>;
-
-defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10>;
-defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10>;
-defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10>;
-defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10>;
-defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10>;
-
-defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11>;
-defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11>;
-defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11>;
-defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11>;
-defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11>;
-
-defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00>;
-defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00>;
-defm SMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlal", 0b00>;
-defm SMLAL_MZZ : sme2_mla_long_array_single<"smlal",0b01, 0b00>;
-defm SMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlal", 0b00>;
-defm SMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlal", 0b00>;
-defm SMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlal", 0b00>;
-defm SMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlal", 0b00>;
-
-defm SMLSL_MZZI : sme2_mla_long_array_index<"smlsl", 0b11, 0b01>;
-defm SMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlsl", 0b01>;
-defm SMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlsl", 0b01>;
-defm SMLSL_MZZ : sme2_mla_long_array_single<"smlsl",0b01, 0b01>;
-defm SMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlsl", 0b01>;
-defm SMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlsl", 0b01>;
-defm SMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlsl", 0b01>;
-defm SMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlsl", 0b01>;
-
-defm UMLAL_MZZI : sme2_mla_long_array_index<"umlal", 0b11, 0b10>;
-defm UMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlal", 0b10>;
-defm UMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlal", 0b10>;
-defm UMLAL_MZZ : sme2_mla_long_array_single<"umlal",0b01, 0b10>;
-defm UMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlal", 0b10>;
-defm UMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlal", 0b10>;
-defm UMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlal", 0b10>;
-defm UMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlal", 0b10>;
-
-defm UMLSL_MZZI : sme2_mla_long_array_index<"umlsl", 0b11, 0b11>;
-defm UMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlsl", 0b11>;
-defm UMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlsl", 0b11>;
-defm UMLSL_MZZ : sme2_mla_long_array_single<"umlsl",0b01, 0b11>;
-defm UMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlsl", 0b11>;
-defm UMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlsl", 0b11>;
-defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11>;
-defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11>;
+defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x1>;
+defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x2>;
+defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x4>;
+defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x1>;
+defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>;
+defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>;
+defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x2>;
+defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x4>;
+
+defm FMLSL_MZZI : sme2_mla_long_array_index<"fmlsl", 0b10, 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x1>;
+defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x2>;
+defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x4>;
+defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x1>;
+defm FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>;
+defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>;
+defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>;
+defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>;
+
+defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x1>;
+defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x2>;
+defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x4>;
+defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x1>;
+defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>;
+defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>;
+defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>;
+defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>;
+
+defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x1>;
+defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x2>;
+defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x4>;
+defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x1>;
+defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>;
+defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>;
+defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>;
+defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>;
+
+defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00, nxv8i16, int_aarch64_sme_smlal_lane_vg2x1>;
+defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x2>;
+defm SMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x4>;
+defm SMLAL_MZZ : sme2_mla_long_array_single<"smlal",0b01, 0b00, nxv8i16, int_aarch64_sme_smlal_single_vg2x1>;
+defm SMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlal", 0b00, int_aarch64_sme_smlal_single_vg2x2>;
+defm SMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlal", 0b00, int_aarch64_sme_smlal_single_vg2x4>;
+defm SMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlal", 0b00, int_aarch64_sme_smlal_vg2x2>;
+defm SMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlal", 0b00, int_aarch64_sme_smlal_vg2x4>;
+
+defm SMLSL_MZZI : sme2_mla_long_array_index<"smlsl", 0b11, 0b01, nxv8i16, int_aarch64_sme_smlsl_lane_vg2x1>;
+defm SMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlsl", 0b01, int_aarch64_sme_smlsl_lane_vg2x2>;
+defm SMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlsl", 0b01, int_aarch64_sme_smlsl_lane_vg2x4>;
+defm SMLSL_MZZ : sme2_mla_long_array_single<"smlsl",0b01, 0b01, nxv8i16, int_aarch64_sme_smlsl_single_vg2x1>;
+defm SMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlsl", 0b01, int_aarch64_sme_smlsl_single_vg2x2>;
+defm SMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlsl", 0b01, int_aarch64_sme_smlsl_single_vg2x4>;
+defm SMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlsl", 0b01, int_aarch64_sme_smlsl_vg2x2>;
+defm SMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlsl", 0b01, int_aarch64_sme_smlsl_vg2x4>;
+
+defm UMLAL_MZZI : sme2_mla_long_array_index<"umlal", 0b11, 0b10, nxv8i16, int_aarch64_sme_umlal_lane_vg2x1>;
+defm UMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlal", 0b10, int_aarch64_sme_umlal_lane_vg2x2>;
+defm UMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlal", 0b10, int_aarch64_sme_umlal_lane_vg2x4>;
+defm UMLAL_MZZ : sme2_mla_long_array_single<"umlal",0b01, 0b10, nxv8i16, int_aarch64_sme_umlal_single_vg2x1>;
+defm UMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlal", 0b10, int_aarch64_sme_umlal_single_vg2x2>;
+defm UMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlal", 0b10, int_aarch64_sme_umlal_single_vg2x4>;
+defm UMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlal", 0b10, int_aarch64_sme_umlal_vg2x2>;
+defm UMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlal", 0b10, int_aarch64_sme_umlal_vg2x4>;
+
+defm UMLSL_MZZI : sme2_mla_long_array_index<"umlsl", 0b11, 0b11, nxv8i16, int_aarch64_sme_umlsl_lane_vg2x1>;
+defm UMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlsl", 0b11, int_aarch64_sme_umlsl_lane_vg2x2>;
+defm UMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlsl", 0b11, int_aarch64_sme_umlsl_lane_vg2x4>;
+defm UMLSL_MZZ : sme2_mla_long_array_single<"umlsl",0b01, 0b11, nxv8i16, int_aarch64_sme_umlsl_single_vg2x1>;
+defm UMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlsl", 0b11, int_aarch64_sme_umlsl_single_vg2x2>;
+defm UMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlsl", 0b11, int_aarch64_sme_umlsl_single_vg2x4>;
+defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x2>;
+defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x4>;
 
 defm FCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"fcvt", 0b0000>;
 defm FCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"fcvtn", 0b0001>;
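Note the narrower slice ranges used by the grouped forms: the x1 variants accept even ZA slice offsets 0 to 14 (a 3-bit immediate), while the vgx2/vgx4 variants accept even offsets 0 to 6 (a 2-bit immediate); hence the tileslicerange3s2/tileslicerange2s2 patterns introduced in SMEInstrFormats.td below. A boundary-case sketch for the vgx2 form, mirroring the tests at the end of this patch:

  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  ; selected as: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h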
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -16,11 +16,14 @@
 def imm_to_tile64  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>;
 def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>;
 
-def tileslice8   : ComplexPattern<i32, 2, "SelectSMETileSlice<4>", []>;
-def tileslice16  : ComplexPattern<i32, 2, "SelectSMETileSlice<3>", []>;
-def tileslice32  : ComplexPattern<i32, 2, "SelectSMETileSlice<2>", []>;
-def tileslice64  : ComplexPattern<i32, 2, "SelectSMETileSlice<1>", []>;
-def tileslice128 : ComplexPattern<i32, 2, "SelectSMETileSlice<0>", []>; // nop
+def tileslice8   : ComplexPattern<i32, 2, "SelectSMETileSlice<15, 1>", []>;
+def tileslice16  : ComplexPattern<i32, 2, "SelectSMETileSlice<7,  1>", []>;
+def tileslice32  : ComplexPattern<i32, 2, "SelectSMETileSlice<3,  1>", []>;
+def tileslice64  : ComplexPattern<i32, 2, "SelectSMETileSlice<1,  1>", []>;
+def tileslice128 : ComplexPattern<i32, 2, "SelectSMETileSlice<0,  1>", []>; // nop
+
+def tileslicerange3s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<14, 2>", []>;
+def tileslicerange2s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<6,  2>", []>;
 
 def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>", [], [SDNPWantRoot]>;
 
@@ -78,6 +81,12 @@
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
 
+class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
+                                     ValueType vt, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn, vt:$Zm),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset, vt:$Zn, zpr_ty:$Zm)>;
+
+
 class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
                                          ValueType vt, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
@@ -104,6 +113,12 @@
           (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
           (REG_SEQUENCE ZPR4Mul4, vt:$Zm1, zsub0, vt:$Zm2, zsub1, vt:$Zm3, zsub2, vt:$Zm4, zsub3))>;
 
+class SME2_ZA_TwoOp_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
+                                    ValueType vt, Operand imm_ty, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn, vt:$Zm, (i32 imm_ty:$i)),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset, vt:$Zn, zpr_ty:$Zm, (i32 imm_ty:$i))>;
+
+
 class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
                                         ValueType vt, Operand imm_ty, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
@@ -1674,7 +1689,7 @@
                                      RegisterOperand multi_vector_ty, string mnemonic, string vg_acronym="">
     : I<(outs MatrixOp32:$ZAda),
-        (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, multi_vector_ty:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3),
+        (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, multi_vector_ty:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3),
         mnemonic, "\t$ZAda[$Rv, $imm" # !if(!eq(vg_acronym, ""), "", ", " # vg_acronym) # "], $Zn, $Zm$i3",
         "", []>, Sched<[]> {
   bits<4> Zm;
@@ -1691,9 +1706,9 @@
   let Constraints = "$ZAda = $_ZAda";
 }
 
-multiclass sme2_mla_long_array_index<string mnemonic, bits<2> op0, bits<2> op> {
+multiclass sme2_mla_long_array_index<string mnemonic, bits<2> op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array_index_base<op0, op, uimm3s2range, ZPR16,
-                                          mnemonic> {
+                                          mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
     bits<3> i3;
     bits<5> Zn;
     bits<3> imm;
@@ -1702,6 +1717,10 @@
     let Inst{9-5} = Zn;
     let Inst{2-0} = imm;
   }
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm3s2range, ZPR16, ZPR4b16, VectorIndexH32b_timm>;
+
+  def : SME2_ZA_TwoOp_Multi_Index_Pat<NAME # _S, intrinsic, uimm3s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange3s2>;
 }
 
 class sme2_mla_long_array_vg2_index<string mnemonic, bits<2> op0, bits<2> op>
@@ -1718,18 +1737,26 @@
   let Inst{1-0} = imm;
 }
 
-multiclass sme2_fp_mla_long_array_vg2_index<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg2_index<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
-                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg2_index<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg2_index<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, VectorIndexH32b_timm, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
-                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
 }
 
 class sme2_mla_long_array_vg4_index<string mnemonic, bits<2> op0, bits<2> op>
@@ -1746,18 +1773,26 @@
   let Inst{1-0} = imm;
 }
 
-multiclass sme2_fp_mla_long_array_vg4_index<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg4_index<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
-                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg4_index<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg4_index<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, VectorIndexH32b_timm, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
-                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+                  (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
 }
 
 class sme2_mla_long_array<bits<2> op0, bits<2> op, Operand index_ty,
@@ -1782,9 +1817,9 @@
   let Constraints = "$ZAda = $_ZAda";
 }
 
-multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op> {
+multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array<op0, op, uimm3s2range,
-                               mnemonic> {
+                               mnemonic> , SMEPseudo2Instr<NAME # _S, 1>{
     bits<4> Zm;
     bits<5> Zn;
     bits<3> imm;
@@ -1793,6 +1828,10 @@
     let Inst{9-5} = Zn;
     let Inst{2-0} = imm;
   }
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm3s2range, ZPR16, ZPR4b16>;
+
+  def : SME2_ZA_TwoOp_Multi_Single_Pat<NAME # _S, intrinsic, uimm3s2range, ZPR4b16, zpr_ty, tileslicerange3s2>;
 }
 
 class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op,
@@ -1810,33 +1849,49 @@
   let Inst{1-0} = imm;
 }
 
-multiclass sme2_fp_mla_long_array_vg2_single<string mnemonic, bits<2> op> {
+multiclass sme2_fp_mla_long_array_vg2_single<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array_vg24_single<0b00, 0b0, op, ZZ_h, mnemonic,
-                                           "vgx2">;
+                                           "vgx2">, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZ_h, ZPR4b16>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op> {
+multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array_vg24_single<0b01, 0b0, op, ZZ_h, mnemonic,
-                                           "vgx2">;
+                                           "vgx2">, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZ_h, ZPR4b16>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>;
 }
 
-multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<2> op> {
+multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array_vg24_single<0b00, 0b1, op, ZZZZ_h, mnemonic,
-                                           "vgx4">;
+                                           "vgx4">, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZZZ_h, ZPR4b16>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg4_single<string mnemonic, bits<2> op> {
+multiclass sme2_int_mla_long_array_vg4_single<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
   def _S : sme2_mla_long_array_vg24_single<0b01, 0b1, op, ZZZZ_h, mnemonic,
-                                           "vgx4">;
+                                           "vgx4">, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZZZ_h, ZPR4b16>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>;
@@ -1855,15 +1910,23 @@
   let Inst{1-0} = imm;
 }
 
-multiclass sme2_fp_mla_long_array_vg2_multi<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b00, op>;
+multiclass sme2_fp_mla_long_array_vg2_multi<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b00, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, zpr_ty, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg2_multi<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b01, op>;
+multiclass sme2_int_mla_long_array_vg2_multi<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b01, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, nxv8i16, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>;
@@ -1884,15 +1947,23 @@
   let Inst{1-0} = imm;
 }
 
-multiclass sme2_fp_mla_long_array_vg4_multi<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b00, op>;
+multiclass sme2_fp_mla_long_array_vg4_multi<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b00, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, zpr_ty, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>;
 }
 
-multiclass sme2_int_mla_long_array_vg4_multi<string mnemonic, bits<2> op> {
-  def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b01, op>;
+multiclass sme2_int_mla_long_array_vg4_multi<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+  def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b01, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+  def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, nxv8i16, tileslicerange2s2>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm",
                   (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
@@ -0,0 +1,1322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+bf16 -verify-machineinstrs < %s | FileCheck %s
+
+;
+; BF/F/S/UMLAL x1 (SINGLE)
+;
+
+define void @multi_vector_add_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    bfmlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT:    bfmlal za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret void
+}
+
+define void @multi_vector_add_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fmlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT:    fmlal za.s[w8, 14:15], z0.h, z1.h
+; 
CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, %zn, %zm) + ret void +} + +define void @multi_vector_add_single_vg2x1_s16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x1_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice.14, %zn, %zm) + ret void +} + +define void @multi_vector_add_single_vg2x1_u16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x1_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice.14, %zn, %zm) + ret void +} + +; +; BF/F/S/UMLSL x1 (SINGLE) +; + +define void @multi_vector_sub_single_vg2x1_bf16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x1_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice.14, %zn, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x1_f16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x1_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice.14, %zn, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x1_s16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x1_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice.14, %zn, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, %zn, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x1_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h +; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice, %zn, %zm) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice.14, %zn, %zm) + ret void +} + +; +; BF/F/S/UMLAL x2 (SINGLE) +; + +define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed 
$z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_add_single_vg2x2_f16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_add_single_vg2x2_s16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x2_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_add_single_vg2x2_u16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x2_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +; +; BF/F/S/UMLSL x2 (SINGLE) +; + +define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: fmlsl 
za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, %zm) + ret void +} + +; +; BF/F/S/UMLAL x4 (SINGLE) +; + +define void @multi_vector_add_single_vg2x4_bf16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_add_single_vg2x4_f16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice, + %zn0, %zn1, %zn2, %zn2, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn2, + %zm) + ret void +} + +define void @multi_vector_add_single_vg2x4_s16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: 
multi_vector_add_single_vg2x4_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_add_single_vg2x4_u16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg2x4_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +; +; BF/F/S/UMLSL x4 (SINGLE) +; + +define void @multi_vector_sub_single_vg2x4_bf16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x4_f16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + 
ret void +} + +define void @multi_vector_sub_single_vg2x4_s16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x4_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, %zn0, %zn1, %zn2, %zn3, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg2x4_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm) + ret void +} + +; +; BF/F/S/UMLAL x2 (MULTI) +; + +define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} 
+ +define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +; +; BF/F/S/UMLSL x2 (MULTI) +; + +define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16: +; 
CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, %zn0, %zn1, %zm0, %zm1) { +; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice, %zn0, %zn1, + %zm0, %zm1) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice.6, %zn0, %zn1, + %zm0, %zm1) + ret void +} + +; +; BF/F/S/UMLAL x4 (MULTI) +; + +define void @multi_vector_add_multi_vg2x4_bf16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_multi_vg2x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +define void @multi_vector_add_multi_vg2x4_f16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_multi_vg2x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; 
CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +define void @multi_vector_add_multi_vg2x4_s16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_multi_vg2x4_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +define void @multi_vector_add_multi_vg2x4_u16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_multi_vg2x4_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +; +; BF/F/S/UMLSL x4 (MULTI) +; + +define void @multi_vector_sub_multi_vg2x4_bf16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_multi_vg2x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +define void @multi_vector_sub_multi_vg2x4_f16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_multi_vg2x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + ret void +} + +define void @multi_vector_sub_multi_vg2x4_s16(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_multi_vg2x4_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, %zm2, %zm3) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice.6, + %zn0, %zn1, %zn2, %zn3, + %zm0, %zm1, 
+  ret void
+}
+
+define void @multi_vector_sub_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+; CHECK-LABEL: multi_vector_sub_multi_vg2x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT:    ret
+            <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+  call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
+;
+; BF/F/S/UMLAL x1 (INDEXED)
+;
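+; Each indexed x1 test below writes to the lowest (0:1) and highest (14:15)
+; ZA.s slice pair and uses the lowest (0) and highest (7) lane index,
+; covering both extremes of each immediate range.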
+
+define void @multi_vector_add_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fmlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    fmlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    bfmlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    bfmlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    smlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    smlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    umlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    umlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+;
+; BF/F/S/UMLSL x1 (INDEXED)
+;
+
+define void @multi_vector_sub_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fmlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    fmlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    bfmlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    bfmlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    smlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    smlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    umlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT:    umlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+  %slice.14 = add i32 %slice, 14
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+;
+; BF/F/S/UMLAL x2 (INDEXED)
+;
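+; The vgx2 and vgx4 indexed forms accept a narrower slice-pair range, so the
+; tests from here on use offsets 0:1 and 6:7 (base slice plus 6) alongside
+; lane indices 0 and 7.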
+
+define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice.6,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice.6,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+;
+; BF/F/S/UMLSL x2 (INDEXED)
+;
+
+define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice.6,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice.6,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+;
+; BF/F/S/UMLAL x4 (INDEXED)
+;
+
+define void @multi_vector_add_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+            <vscale x 8 x half> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice.6,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+            <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+            <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice.6,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+            <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+;
+; BF/F/S/UMLSL x4 (INDEXED)
+;
+
+define void @multi_vector_sub_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+            <vscale x 8 x half> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice.6,
+            <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+            <vscale x 8 x half> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+            <vscale x 8 x bfloat> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice.6,
+            <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+            <vscale x 8 x bfloat> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 0)
+  %slice.6 = add i32 %slice, 6
+  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
+            <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+            <vscale x 8 x i16> %zm, i32 7)
+  ret void
+}
+
+declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+            <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
+            <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+            <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
+            <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+            <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+            <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+            <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+            <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+            <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)