diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2692,4 +2692,69 @@
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
 }
+
+  //
+  // SME2 Intrinsics
+  //
+
+  class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>],
+                []>;
+
+  class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>],
+                []>;
+
+  class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                []>;
+
+  class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                []>;
+
+  class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, llvm_i32_ty],
+                [ImmArg<ArgIndex<4>>]>;
+
+  class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>, llvm_i32_ty],
+                [ImmArg<ArgIndex<6>>]>;
+
+  //
+  // Multi-vector fused multiply-add/subtract
+  //
+
+  def int_aarch64_sme_fmla_single_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_fmls_single_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_fmla_single_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sme_fmls_single_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+  def int_aarch64_sme_fmla_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_fmls_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_fmla_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_fmls_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
+  def int_aarch64_sme_fmla_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+  def int_aarch64_sme_fmls_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+  def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+  def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -596,7 +596,8 @@
                               MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
-                                 MachineInstr &MI, MachineBasicBlock *BB) const;
+                                 MachineInstr &MI, MachineBasicBlock *BB,
+                                 bool HasTile) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2698,13 +2698,19 @@
 MachineBasicBlock *
 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
-                                   MachineBasicBlock *BB) const {
+                                   MachineBasicBlock *BB, bool HasTile) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned StartIdx = 0;
 
-  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
-  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
-  for (unsigned I = 1; I < MI.getNumOperands(); ++I)
+  if (HasTile) {
+    MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+    MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+    StartIdx = 1;
+  } else
+    MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
+
+  for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
     MIB.add(MI.getOperand(I));
 
   MI.eraseFromParent(); // The pseudo is gone now.
@@ -2737,16 +2743,18 @@
   uint64_t SMEMatrixType =
       TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
   switch (SMEMatrixType) {
+  case (AArch64::SMEMatrixArray):
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
   case (AArch64::SMEMatrixTileB):
-    return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
   case (AArch64::SMEMatrixTileH):
-    return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
   case (AArch64::SMEMatrixTileS):
-    return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
   case (AArch64::SMEMatrixTileD):
-    return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
   case (AArch64::SMEMatrixTileQ):
-    return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
+    return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -249,30 +249,30 @@ let Predicates = [HasSME2] in {
 defm ADD_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b0011010, MatrixOp32, ZZ_s, ZPR4b32>;
 defm ADD_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b0111010, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r>;
-defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r>;
+defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r, nxv4i32, null_frag>;
+defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, null_frag>;
 defm ADD_VG2_2ZZ : sme2_int_sve_destructive_vector_vg2_single<"add", 0b0110000>;
 defm ADD_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"add", 0b0110000>;
 
 defm SUB_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b0011011, MatrixOp32, ZZ_s, ZPR4b32>;
 defm SUB_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b0111011, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r>;
-defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, 
ZZZZ_s_mul_r>; - -defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32>; -defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32>; -defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r>; -defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r>; -defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32>; -defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32>; - -defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32>; -defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32>; -defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r>; -defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r>; -defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32>; -defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32>; +defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r, nxv4i32, null_frag>; +defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, null_frag>; + +defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x2>; +defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x4>; +defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>; +defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x2>; +defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x4>; + +defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x2>; +defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x4>; +defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>; +defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>; +defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x4>; defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r>; defm ADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"add", 0b0010, MatrixOp32, ZZZZ_s_mul_r>; @@ -446,71 +446,71 @@ defm UCLAMP_VG2_2Z2Z : 
sme2_int_clamp_vector_vg2_multi<"uclamp", 0b1>; defm UCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"uclamp", 0b1>; -defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16>; -defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16>; +defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>; +defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>; defm FDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010000, MatrixOp32, ZZ_h, ZPR4b16>; defm FDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110000, MatrixOp32, ZZZZ_h, ZPR4b16>; -defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r>; -defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r>; +defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16>; -defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16>; +defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>; +defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>; defm BFDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"bfdot", 0b0010010, MatrixOp32, ZZ_h, ZPR4b16>; defm BFDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"bfdot", 0b0110010, MatrixOp32, ZZZZ_h, ZPR4b16>; -defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r>; -defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r>; +defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16>; +defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>; -defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16>; +defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>; -defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16>; -defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8>; -defm SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16>; -defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8>; +defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; +defm 
SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; defm SDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1010101, MatrixOp32, ZZ_h, ZPR4b16>; defm SDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1110101, MatrixOp32, ZZZZ_h, ZPR4b16>; -defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r>; -defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r>; +defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r, nxv8i16, null_frag>; +defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, null_frag>; defm SDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b0010100, MatrixOp32, ZZ_b, ZPR4b8>; defm SDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b0110100, MatrixOp32, ZZZZ_b, ZPR4b8>; -defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, ZZ_b_mul_r>; -defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r>; +defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>; +defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>; -defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8>; -defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8>; +defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; +defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; defm SUDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"sudot", 0b0010111, MatrixOp32, ZZ_b, ZPR4b8>; defm SUDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"sudot", 0b0110111, MatrixOp32, ZZZZ_b, ZPR4b8>; -defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16>; -defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8>; +defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; -defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8>; +defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; -defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16>; -defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8>; -defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8>; -defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16>; +defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm 
UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; +defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; +defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; defm UDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1010111, MatrixOp32, ZZ_h, ZPR4b16>; defm UDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1110111, MatrixOp32, ZZZZ_h, ZPR4b16>; -defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r>; -defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r>; +defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r, nxv8i16, null_frag>; +defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, null_frag>; defm UDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b0010110, MatrixOp32, ZZ_b, ZPR4b8>; defm UDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b0110110, MatrixOp32, ZZZZ_b, ZPR4b8>; -defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r>; -defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r>; +defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>; +defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>; -defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8>; -defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8>; +defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; +defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; defm USDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"usdot", 0b0010101, MatrixOp32, ZZ_b, ZPR4b8>; defm USDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"usdot", 0b0110101, MatrixOp32, ZZZZ_b, ZPR4b8>; -defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r>; -defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r>; +defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>; +defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>; -defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8>; +defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; -defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16>; -defm UVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8>; +defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm UVDOT_VG4_M4ZZI_BtoS : 
sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>; def SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b000>; defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b000>; @@ -707,13 +707,13 @@ let Predicates = [HasSME2, HasSMEI16I64] in { defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64>; defm ADD_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b1111010, MatrixOp64, ZZZZ_d, ZPR4b64>; -defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r>; -defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r>; +defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r, nxv2i64, null_frag>; +defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, null_frag>; defm SUB_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b1011011, MatrixOp64, ZZ_d, ZPR4b64>; defm SUB_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b1111011, MatrixOp64, ZZZZ_d, ZPR4b64>; -defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r>; -defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r>; +defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r, nxv2i64, null_frag>; +defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, null_frag>; defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r>; defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r>; @@ -721,23 +721,23 @@ defm SUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"sub", 0b1011, MatrixOp64, ZZ_d_mul_r>; defm SUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"sub", 0b1011, MatrixOp64, ZZZZ_d_mul_r>; -defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h_mul_r, ZPR4b16>; -defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16>; +defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; defm SDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1010100, MatrixOp64, ZZ_h, ZPR4b16>; defm SDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1110100, MatrixOp64, ZZZZ_h, ZPR4b16>; -defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r>; -defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r>; +defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r, nxv8i16, null_frag>; +defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, null_frag>; -defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16>; +defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; -defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h_mul_r, ZPR4b16>; -defm UDOT_VG4_M4ZZI_HtoD : 
sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16>; +defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; +defm UDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; defm UDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1010110, MatrixOp64, ZZ_h, ZPR4b16>; defm UDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1110110, MatrixOp64, ZZZZ_h, ZPR4b16>; -defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r>; -defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r>; +defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r, nxv8i16, null_frag>; +defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, null_frag>; -defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16>; +defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>; def SMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlall", 0b00>; defm SMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlall", 0b00>; @@ -777,19 +777,19 @@ } let Predicates = [HasSME2, HasSMEF64F64] in { -defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mul_r, ZPR4b64>; -defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64>; -defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64>; -defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64>; -defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r>; -defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, ZZZZ_d_mul_r>; - -defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64>; -defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64>; -defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64>; -defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64>; -defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r>; -defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r>; +defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x2>; +defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x4>; +defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x2>; +defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x4>; +defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, 
ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>; + +defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x2>; +defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x4>; +defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x2>; +defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x4>; +defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>; defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r>; defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r>; @@ -824,15 +824,15 @@ defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b00>; defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, MatrixOp16, ZZ_h_mul_r>; -defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r>; +defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b01>; defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b01>; defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r>; -defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r>; +defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>; defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>; @@ -851,15 +851,15 @@ defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b10>; defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>; defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r>; -defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r>; +defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG4_M4Z4Z : 
sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
 
 defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b11>;
 defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b11>;
 defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>;
 defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r>;
-defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r>;
+defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
 
 defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -50,6 +50,74 @@
   let usesCustomInserter = 1;
 }
 
+class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
+class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
+class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, Operand imm_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SME pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+          zpr_ty:$Zm)>;
+class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+                     vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+                                              (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+                                              zpr_ty:$Zm)>;
+
+class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm1, vt:$Zm2),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+                                              (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+                                              (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>;
+
+class SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+                     vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm1, vt:$Zm2, vt:$Zm3, vt:$Zm4),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+                                              (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+                                              (REG_SEQUENCE ZPR4Mul4, vt:$Zm1, zsub0, vt:$Zm2, zsub1, vt:$Zm3, zsub2, vt:$Zm4, zsub3))>;
+
+class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, Operand imm_ty, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+                                              (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
+
+class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt, Operand imm_ty, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+                     vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+                                              (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+                                              zpr_ty:$Zm, imm_ty:$i)>;
+
 //===----------------------------------------------------------------------===//
 // SME Outer Products
 //===----------------------------------------------------------------------===//
@@ -1253,10 +1321,38 @@
                                                   MatrixOperand matrix_ty,
                                                   RegisterOperand multi_vector_ty,
                                                   ZPRRegOp zpr_ty>{
-  def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>;
+  def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+}
+
+multiclass sme2_dot_mla_add_sub_array_vg2_single<string mnemonic, bits<7> op,
+                                                 MatrixOperand matrix_ty,
+                                                 RegisterOperand multi_vector_ty,
+                                                 ZPRRegOp zpr_ty, ValueType vty, SDPatternOperator intrinsic>{
+  def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+                  (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, zpr_ty, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, zpr_ty, vty, tileslice16>;
+}
+
+multiclass sme2_dot_mla_add_sub_array_vg4_single<string mnemonic, bits<7> op,
+                                                 MatrixOperand matrix_ty,
+                                                 RegisterOperand multi_vector_ty,
+                                                 ZPRRegOp zpr_ty, ValueType vty, SDPatternOperator intrinsic>{
+  def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
 
   def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
                   (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, zpr_ty, SMEMatrixArray>;
+
+  def : 
SME2_ZA_TwoOp_VG4_Multi_Single_Pat; } //===----------------------------------------------------------------------===// @@ -1290,12 +1386,16 @@ multiclass sme2_dot_mla_add_sub_array_vg2_multi op, MatrixOperand matrix_ty, - RegisterOperand multi_vector_ty>{ - def NAME : sme2_dot_mla_add_sub_array_vg2_multi; + RegisterOperand multi_vector_ty, ValueType zpr_ty, + SDPatternOperator intrinsic> { + def NAME : sme2_dot_mla_add_sub_array_vg2_multi, SMEPseudo2Instr; - def : InstAlias(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; + def : InstAlias(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } class sme2_dot_mla_add_sub_array_vg4_multi op, @@ -1325,15 +1425,18 @@ let Constraints = "$ZAd = $_ZAd"; } - multiclass sme2_dot_mla_add_sub_array_vg4_multi op, MatrixOperand matrix_ty, - RegisterOperand multi_vector_ty>{ - def NAME : sme2_dot_mla_add_sub_array_vg4_multi; + RegisterOperand multi_vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic>{ + def NAME : sme2_dot_mla_add_sub_array_vg4_multi, SMEPseudo2Instr; - def : InstAlias(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; + def : InstAlias(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } //===----------------------------------------------------------------------===// @@ -2108,15 +2211,21 @@ // SME2 multi-vec ternary indexed two registers 32-bit multiclass sme2_multi_vec_array_vg2_index_32b op, RegisterOperand multi_vector_ty, - ZPRRegOp vector_ty> { + ZPRRegOp vector_ty, ValueType vt, + SDPatternOperator intrinsic> { def NAME : sme2_multi_vec_array_vg2_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty, - VectorIndexS, mnemonic> { + VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; } + + def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; + + def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; + def : InstAlias(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS:$i), 0>; + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>; } // SME2.1 multi-vec ternary indexed two registers 16-bit @@ -2141,7 +2250,7 @@ string mnemonic> : I<(outs MatrixOp64:$ZAda), (ins MatrixOp64:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), mnemonic, "\t$ZAda[$Rv, $imm3, vgx2], $Zn, $Zm$i1", "", []>, Sched<[]> { bits<4> Zm; @@ -2165,13 +2274,18 @@ multiclass sme2_multi_vec_array_vg2_index_64b op, RegisterOperand multi_vector_ty, - ZPRRegOp vector_ty> { + ZPRRegOp vector_ty, ValueType vt, + SDPatternOperator intrinsic> { def NAME : sme2_multi_vec_array_vg2_index_64b; + mnemonic>, SMEPseudo2Instr; + + def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; + + def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; def : InstAlias(NAME) MatrixOp64:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), 0>; + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>; } class 
sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty, @@ -2205,16 +2319,21 @@ // SME2 multi-vec ternary indexed four registers 32-bit multiclass sme2_multi_vec_array_vg4_index_32b op, RegisterOperand multi_vector_ty, - ZPRRegOp vector_ty> { + ZPRRegOp vector_ty, ValueType vt, + SDPatternOperator intrinsic> { def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, - vector_ty, VectorIndexS, mnemonic>{ + vector_ty, VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; } + def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; + + def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; + def : InstAlias(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS:$i), 0>; + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>; } // SME2.1 multi-vec ternary indexed four registers 16-bit @@ -2239,7 +2358,7 @@ string mnemonic> : I<(outs MatrixOp64:$ZAda), (ins MatrixOp64:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), mnemonic, "\t$ZAda[$Rv, $imm3, vgx4], $Zn, $Zm$i1", "", []>, Sched<[]> { bits<4> Zm; @@ -2264,13 +2383,18 @@ multiclass sme2_multi_vec_array_vg4_index_64b op, RegisterOperand multi_vector_ty, - ZPRRegOp vector_ty> { + ZPRRegOp vector_ty, ValueType vty, + SDPatternOperator intrinsic> { def NAME : sme2_multi_vec_array_vg4_index_64b; + mnemonic>, SMEPseudo2Instr; + + def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; + + def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; def : InstAlias(NAME) MatrixOp64:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), 0>; + multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>; } //===----------------------------------------------------------------------===// // SME2 multi-vec indexed long long MLA one source 32-bit diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll @@ -0,0 +1,658 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-f64f64 -verify-machineinstrs | FileCheck %s + +; FMLA (SINGLE) + +define void @multi_vector_add_single_vg1x2_s(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg1x2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s +; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + +define void @multi_vector_add_single_vg1x2_d(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_add_single_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d +; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, 
z2.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + +define void @multi_vector_add_single_vg1x4_s(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_single_vg1x4_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s +; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s +; CHECK-NEXT: ret + %zm) { + call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_add_single_vg1x4_d(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_single_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d +; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d +; CHECK-NEXT: ret + %zm) { + call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + +; FMLS (SINGLE) + +define void @multi_vector_sub_single_vg1x2_s(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg1x2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s +; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + +define void @multi_vector_sub_single_vg1x2_d(i32 %slice, %zn0, %zn1, %zm) { +; CHECK-LABEL: multi_vector_sub_single_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d +; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice, + %zn0, %zn1, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zm) + ret void +} + +define void @multi_vector_sub_single_vg1x4_s(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_single_vg1x4_s: 
+; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s +; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s +; CHECK-NEXT: ret + %zm) { + call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + +define void @multi_vector_sub_single_vg1x4_d(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_single_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d +; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d +; CHECK-NEXT: ret + %zm) { + call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice, + %zn0, %zn1, + %zn2, %zn3, + %zm) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zn2, %zn3, + %zm) + ret void +} + +; FMLA (MULTI) + +define void @multi_vector_add_vg1x2_s(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: multi_vector_add_vg1x2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s } +; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s } +; CHECK-NEXT: ret + %zm1, %zm2) { + call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + +define void @multi_vector_add_vg1x2_d(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: multi_vector_add_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d } +; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d } +; CHECK-NEXT: ret + %zm1, %zm2) { + call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + +; Test to ensure the correct register class is used (first register in the list should be a multiple of 2) +define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: 
multi_vector_add_vg1x2_s_regclass: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov z4.d, z3.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s } +; CHECK-NEXT: ret + %zm0, %zm1) { + call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice, + %zn1, %zn0, + %zm1, %zm0) + ret void +} + +define void @multi_vector_add_vg1x4_s(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_vg1x4_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { + call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice.7, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + ret void +} + +define void @multi_vector_add_vg1x4_d(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { + call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice.7, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + ret void +} + +; Test to ensure the correct register class is used (first register in the list should be a multiple of 4) +define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_add_vg1x4_s_regclass: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z26.d, z7.d +; CHECK-NEXT: mov z30.d, z3.d +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov z25.d, z6.d +; CHECK-NEXT: mov z29.d, z2.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: mov z28.d, z1.d +; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: mov z31.d, z0.d +; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z28.s - z31.s }, { z24.s - 
z27.s } +; CHECK-NEXT: ret + %zm0, %zm1, %zm2, %zm3) { + call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice, + %zn1, %zn2, %zn3, %zn0, + %zm1, %zm2, %zm3, %zm0) + ret void +} + +; FMLS (MULTI) + +define void @multi_vector_sub_vg1x2_s(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: multi_vector_sub_vg1x2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s } +; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s } +; CHECK-NEXT: ret + %zm1, %zm2) { + call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + +define void @multi_vector_sub_vg1x2_d(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: multi_vector_sub_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d } +; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d } +; CHECK-NEXT: ret + %zm1, %zm2) { + call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice, + %zn0, %zn1, + %zm1, %zm2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice.7, + %zn0, %zn1, + %zm1, %zm2) + ret void +} + +define void @multi_vector_sub_vg1x4_s(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_vg1x4_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { + call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice.7, + %zn0, %zn1, %zn2, %zn3, + %zm1, %zm2, %zm3, %zm4) + ret void +} + +define void @multi_vector_sub_vg1x4_d(i32 %slice, %zn0, %zn1, %zn2, %zn3, +; CHECK-LABEL: multi_vector_sub_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def 
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: ret
+            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice.7,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+  ret void
+}
+
+; FMLA (INDEXED)
+
+define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zm, i32 3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice.7,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zm, i32 1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice.7,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zm, i32 1)
+  ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
+define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: ret
+            <vscale x 4 x float> %zm) {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+            <vscale x 4 x float> %zm, i32 3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice.7,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+define void @multi_vector_add_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: ret
+            <vscale x 2 x double> %zm) {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm, i32 1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice.7,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm, i32 1)
+  ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
+define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: mov z27.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z24.s - z27.s }, z4.s[3]
+; CHECK-NEXT: ret
+            <vscale x 4 x float> %zm) {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2,
+            <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+; FMLS (INDEXED)
+
+define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zm, i32 3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice.7,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zm, i32 1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice.7,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zm, i32 1)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_sub_lane_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: ret
+            <vscale x 4 x float> %zm) {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+            <vscale x 4 x float> %zm, i32 3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice.7,
+            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+            <vscale x 4 x float> %zm, i32 3)
+  ret void
+}
+
+define void @multi_vector_sub_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_sub_lane_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: ret
+            <vscale x 2 x double> %zm) {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm, i32 1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice.7,
+            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+            <vscale x 2 x double> %zm, i32 1)
+  ret void
+}
+
+declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
+            <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
+            <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
+            <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
+            <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+
+declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)