diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2952,4 +2952,14 @@
   def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+
+  //
+  // Multi-vector add/sub and accumulate into ZA
+  //
+  foreach intr = ["add", "sub"] in {
+    foreach za = ["za32", "za64"] in {
+      def int_aarch64_sme_ # intr # _ # za # _vg1x2 : SME2_ZA_Write_VG2_Intrinsic;
+      def int_aarch64_sme_ # intr # _ # za # _vg1x4 : SME2_ZA_Write_VG4_Intrinsic;
+    }
+  }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -274,17 +274,17 @@
 defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>;
 defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x4>;

-defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r>;
-defm ADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"add", 0b0010, MatrixOp32, ZZZZ_s_mul_r>;
+defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_za32_vg1x2>;
+defm ADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"add", 0b0010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_za32_vg1x4>;

-defm SUB_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"sub", 0b0011, MatrixOp32, ZZ_s_mul_r>;
-defm SUB_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"sub", 0b0011, MatrixOp32, ZZZZ_s_mul_r>;
+defm SUB_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"sub", 0b0011, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_za32_vg1x2>;
+defm SUB_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"sub", 0b0011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_za32_vg1x4>;

-defm FADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0000, MatrixOp32, ZZ_s_mul_r>;
-defm FADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0000, MatrixOp32, ZZZZ_s_mul_r>;
+defm FADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_add_za32_vg1x2>;
+defm FADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_add_za32_vg1x4>;

-defm FSUB_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0001, MatrixOp32, ZZ_s_mul_r>;
-defm FSUB_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0001, MatrixOp32, ZZZZ_s_mul_r>;
+defm FSUB_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_sub_za32_vg1x2>;
+defm FSUB_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_sub_za32_vg1x4>;

 defm SQDMULH_VG2_2ZZ : sme2_int_sve_destructive_vector_vg2_single<"sqdmulh", 0b1000000>;
 defm SQDMULH_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"sqdmulh", 0b1000000>;
@@ -715,11 +715,11 @@
 defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x2>;
 defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x4>;

-defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r>;
-defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r>;
+defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x2>;
+defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x4>;

-defm SUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"sub", 0b1011, MatrixOp64, ZZ_d_mul_r>;
-defm SUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"sub", 0b1011, MatrixOp64, ZZZZ_d_mul_r>;
+defm SUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"sub", 0b1011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_za64_vg1x2>;
+defm SUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"sub", 0b1011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_za64_vg1x4>;

 defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
 defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
@@ -791,11 +791,11 @@
 defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>;
 defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>;

-defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r>;
-defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r>;
+defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x2>;
+defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x4>;

-defm FSUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fsub", 0b1001, MatrixOp64, ZZ_d_mul_r>;
-defm FSUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fsub", 0b1001, MatrixOp64, ZZZZ_d_mul_r>;
+defm FSUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fsub", 0b1001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_sub_za64_vg1x2>;
+defm FSUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fsub", 0b1001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_sub_za64_vg1x4>;
 }

 let Predicates = [HasSME2p1] in {
@@ -815,10 +815,10 @@
 }

 let Predicates = [HasSME2p1, HasSMEF16F16] in {
-defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r>;
-defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r>;
-defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r>;
-defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r>;
+defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
+defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
+defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;

 defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00>;
 defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b00>;
@@ -842,10 +842,10 @@
 }

 let Predicates = [HasSME2p1, HasB16B16] in {
-defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r>;
-defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r>;
-defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r>;
-defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r>;
+defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;

 defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b10>;
 defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b10>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -77,6 +77,13 @@
   let usesCustomInserter = 1;
 }

+class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -137,6 +144,14 @@
   : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4)),
         (!cast<Instruction>(name) (REG_SEQUENCE ZPR4Mul4, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1, in_vt:$Zn3, zsub2, in_vt:$Zn4, zsub3))>;

+class SME2_ZA_VG1x2_Multi_Pat<string name, SDPatternOperator intrinsic, ValueType vt, Operand index_ty, ComplexPattern tileslice>
+  : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2),
+        (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1))>;
+
+class SME2_ZA_VG1x4_Multi_Pat<string name, SDPatternOperator intrinsic, ValueType vt, Operand index_ty, ComplexPattern tileslice>
+  : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+        (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
+
 //===----------------------------------------------------------------------===//
 // SME Outer Products
 //===----------------------------------------------------------------------===//
@@ -1510,11 +1525,16 @@
 multiclass sme2_multivec_accum_add_sub_vg2<string mnemonic, bits<4> op,
                                            MatrixOperand matrix_ty,
-                                           RegisterOperand vector_ty> {
-  def NAME : sme2_multivec_accum_add_sub_vg2<mnemonic, op, matrix_ty, vector_ty>;
-
+                                           RegisterOperand vector_ty,
+                                           ValueType vty,
+                                           SDPatternOperator intrinsic> {
+  def NAME : sme2_multivec_accum_add_sub_vg2<mnemonic, op, matrix_ty, vector_ty>,
+             SMEPseudo2Instr<NAME, 1>;
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
                   (!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
+
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def : SME2_ZA_VG1x2_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }

 class sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
@@ -1528,11 +1548,16 @@
 multiclass sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
                                            MatrixOperand matrix_ty,
-                                           RegisterOperand vector_ty> {
-  def NAME : sme2_multivec_accum_add_sub_vg4<mnemonic, op, matrix_ty, vector_ty>;
-
+                                           RegisterOperand vector_ty,
+                                           ValueType vty,
+                                           SDPatternOperator intrinsic> {
+  def NAME : sme2_multivec_accum_add_sub_vg4<mnemonic, op, matrix_ty, vector_ty>,
+             SMEPseudo2Instr<NAME, 1>;
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
                   (!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
+
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def : SME2_ZA_VG1x4_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }

 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
@@ -216,6 +216,160 @@
   ret void
 }

+;
+; ADD and accumulate into ZA
+;
+; x2
+define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
+; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
+; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  ret void
+}
+
+; x4
+
+define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
+; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
+                                                     <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+                                                     <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7,
+                                                     <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+                                                     <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
+; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
+                                                     <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+                                                     <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7,
+                                                     <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+                                                     <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+                                                     <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+                                                     <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret void
+}
+
+define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
+; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+                                                     <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+                                                     <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
@@ -224,3 +378,11 @@
 declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
@@ -216,6 +216,170 @@
   ret void
 }

+
+;
+; SUB and accumulate into ZA
+;
+
+; x2
+define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_sub_za_vg1x2_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    sub za.s[w8, 0, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    sub za.s[w8, 7, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
+; CHECK-LABEL: multi_vector_sub_za_vg1x2_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    sub za.d[w8, 0, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    sub za.d[w8, 7, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_sub_za_vg1x2_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fsub za.s[w8, 0, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    fsub za.s[w8, 7, vgx2], { z0.s, z1.s }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 %slice,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 %slice.7,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
+; CHECK-LABEL: multi_vector_sub_za_vg1x2_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fsub za.d[w8, 0, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    fsub za.d[w8, 7, vgx2], { z0.d, z1.d }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 %slice,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 %slice.7,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
+  ret void
+}
+
+; x4
+
+define void @multi_vector_sub_za_vg1x4_i32(i32 %slice,
+; CHECK-LABEL: multi_vector_sub_za_vg1x4_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    sub za.s[w8, 0, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    sub za.s[w8, 7, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    ret
+                                           <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+                                           <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
+  call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 %slice,
+                                                     <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+                                                     <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 %slice.7,
+                                                     <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
+                                                     <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x4_i64(i32 %slice,
+; CHECK-LABEL: multi_vector_sub_za_vg1x4_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    sub za.d[w8, 0, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    sub za.d[w8, 7, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    ret
+                                           <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+                                           <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
+  call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 %slice,
+                                                     <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+                                                     <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 %slice.7,
+                                                     <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
+                                                     <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x4_f32(i32 %slice,
+; CHECK-LABEL: multi_vector_sub_za_vg1x4_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fsub za.s[w8, 0, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    fsub za.s[w8, 7, vgx4], { z0.s - z3.s }
+; CHECK-NEXT:    ret
+                                           <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+                                           <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+  call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 %slice,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+                                                     <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 %slice.7,
+                                                     <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+                                                     <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret void
+}
+
+define void @multi_vector_sub_za_vg1x4_f64(i32 %slice,
+; CHECK-LABEL: multi_vector_sub_za_vg1x4_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    fsub za.d[w8, 0, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    fsub za.d[w8, 7, vgx4], { z0.d - z3.d }
+; CHECK-NEXT:    ret
+                                           <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+                                           <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
+  call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 %slice,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+                                                     <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 %slice.7,
+                                                     <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+                                                     <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>,
@@ -226,3 +390,11 @@
 declare void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)