diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2976,6 +2976,18 @@ def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; + // + // Outer product and accumulate/subtract intrinsics + // + + def int_aarch64_sme_smopa_za32 : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_umopa_za32 : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_smops_za32 : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_umops_za32 : SME_OuterProduct_Intrinsic; + + def int_aarch64_sme_bmopa_za32 : SME_OuterProduct_Intrinsic; + def int_aarch64_sme_bmops_za32 : SME_OuterProduct_Intrinsic; + // Multi-vector saturating rounding shift right intrinsics def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -563,14 +563,14 @@ defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r>; defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r>; -defm BMOPA_MPPZZ_S : sme2_bfp_mopx_tile<"bmopa", 0b100>; -defm BMOPS_MPPZZ_S : sme2_bfp_mopx_tile<"bmops", 0b101>; +defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>; +defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>; -defm SMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"smopa", 0b000>; -defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001>; +defm SMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"smopa", 0b000, int_aarch64_sme_smopa_za32>; +defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001, int_aarch64_sme_smops_za32>; -defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100>; -defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101>; +defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>; +defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>; def ZERO_T : sme2_zero_zt<"zero", 0b0001>; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -182,6 +182,14 @@ : Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), (!cast(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>; +//===----------------------------------------------------------------------===// +// SME pattern match helpers. +//===----------------------------------------------------------------------===// + +class SME_ZA_Tile_TwoPred_TwoVec_Pat + : Pat<(intrinsic imm_ty:$tile, (pg_ty PPR3bAny:$Pn), (pg_ty PPR3bAny:$Pm), vt:$Zn, vt:$Zm), + (!cast(name # _PSEUDO) $tile, $Pn, $Pm, $Zn, $Zm)>; + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// @@ -220,9 +228,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), - (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme_outer_product_fp64 { @@ -233,9 +239,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), - (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_7:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme2p1_fmop_tile_fp16{ @@ -284,9 +288,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm), - (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme_int_outer_product_i64 opc, string mnemonic, @@ -299,9 +301,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), - (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_7:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } class sme_outer_product_widening_inst opc, ZPRRegOp zpr_ty, string mnemonic> @@ -336,9 +336,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), - (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme_f16_outer_product opc, string mnemonic, SDPatternOperator op> { @@ -346,9 +344,7 @@ def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; - def : Pat<(op timm32_0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), - (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)), - (!cast(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>; + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } //===----------------------------------------------------------------------===// @@ -2839,16 +2835,24 @@ //===----------------------------------------------------------------------===// // SME2 Outer Product and Accumulate -multiclass sme2_int_mopx_tile op> { - def NAME : sme_int_outer_product_inst { +multiclass sme2_int_mopx_tile op, SDPatternOperator intrinsic> { + def NAME : sme_int_outer_product_inst, SMEPseudo2Instr { bits<2> ZAda; let Inst{1-0} = ZAda; let Inst{2} = 0b0; } + + def _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; + + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } -multiclass sme2_bfp_mopx_tile op> { - def NAME : sme_outer_product_widening_inst; +multiclass sme2_int_bmopx_tile op, SDPatternOperator intrinsic> { + def NAME : sme_outer_product_widening_inst, SMEPseudo2Instr; + + def _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; + + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } //===----------------------------------------------------------------------===/// diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll @@ -4,36 +4,36 @@ define void @bfmopa( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: bfmopa: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmopa za0.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: bfmopa za3.s, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, %pn, %pm, %zn, %zm) ret void } define void @fmopa( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: fmopa: ; CHECK: // %bb.0: -; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, %pn, %pm, %zn, %zm) ret void } define void @smopa_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: smopa_s: ; CHECK: // %bb.0: -; CHECK-NEXT: smopa za2.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: smopa za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } define void @smopa_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: smopa_d: ; CHECK: // %bb.0: -; CHECK-NEXT: smopa za0.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: smopa za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } @@ -49,54 +49,54 @@ define void @umopa_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: umopa_d: ; CHECK: // %bb.0: -; CHECK-NEXT: umopa za1.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: umopa za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } define void @fmopa_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: fmopa_s: ; CHECK: // %bb.0: -; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s +; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, %pn, %pm, %zn, %zm) ret void } define void @fmopa_d( %pn, %pm, %zn, %zm) #1 { ; CHECK-LABEL: fmopa_d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmopa za2.d, p0/m, p1/m, z0.d, z1.d +; CHECK-NEXT: fmopa za7.d, p0/m, p1/m, z0.d, z1.d ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mopa.nxv2f64(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, %pn, %pm, %zn, %zm) ret void } define void @sumopa_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: sumopa_s: ; CHECK: // %bb.0: -; CHECK-NEXT: sumopa za1.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: sumopa za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } define void @sumopa_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: sumopa_d: ; CHECK: // %bb.0: -; CHECK-NEXT: sumopa za3.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: sumopa za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } define void @usmopa_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: usmopa_s: ; CHECK: // %bb.0: -; CHECK-NEXT: usmopa za2.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: usmopa za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll @@ -4,36 +4,36 @@ define void @bfmops( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: bfmops: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmops za0.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: bfmops za3.s, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, %pn, %pm, %zn, %zm) ret void } define void @fmops( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: fmops: ; CHECK: // %bb.0: -; CHECK-NEXT: fmops za1.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: fmops za3.s, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, %pn, %pm, %zn, %zm) ret void } define void @smops_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: smops_s: ; CHECK: // %bb.0: -; CHECK-NEXT: smops za2.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: smops za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } define void @smops_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: smops_d: ; CHECK: // %bb.0: -; CHECK-NEXT: smops za0.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: smops za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } @@ -49,54 +49,54 @@ define void @umops_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: umops_d: ; CHECK: // %bb.0: -; CHECK-NEXT: umops za1.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: umops za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } define void @fmops_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: fmops_s: ; CHECK: // %bb.0: -; CHECK-NEXT: fmops za0.s, p0/m, p1/m, z0.s, z1.s +; CHECK-NEXT: fmops za3.s, p0/m, p1/m, z0.s, z1.s ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mops.nxv4f32(i32 0, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, %pn, %pm, %zn, %zm) ret void } define void @fmops_d( %pn, %pm, %zn, %zm) #1 { ; CHECK-LABEL: fmops_d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmops za2.d, p0/m, p1/m, z0.d, z1.d +; CHECK-NEXT: fmops za7.d, p0/m, p1/m, z0.d, z1.d ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.mops.nxv2f64(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, %pn, %pm, %zn, %zm) ret void } define void @sumops_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: sumops_s: ; CHECK: // %bb.0: -; CHECK-NEXT: sumops za1.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: sumops za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 1, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } define void @sumops_d( %pn, %pm, %zn, %zm) #0 { ; CHECK-LABEL: sumops_d: ; CHECK: // %bb.0: -; CHECK-NEXT: sumops za3.d, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: sumops za7.d, p0/m, p1/m, z0.h, z1.h ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 7, %pn, %pm, %zn, %zm) ret void } define void @usmops_s( %pn, %pm, %zn, %zm) { ; CHECK-LABEL: usmops_s: ; CHECK: // %bb.0: -; CHECK-NEXT: usmops za2.s, p0/m, p1/m, z0.b, z1.b +; CHECK-NEXT: usmops za3.s, p0/m, p1/m, z0.b, z1.b ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 2, %pn, %pm, %zn, %zm) + call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, %pn, %pm, %zn, %zm) ret void } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; MOPA/MOPS + +define void @outer_sum_accumulate_s16( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: outer_sum_accumulate_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: smopa za3.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +define void @outer_sum_accumulate_u16( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: outer_sum_accumulate_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: umopa za3.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +define void @outer_sum_subtract_s16( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: outer_sum_subtract_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: smops za3.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +define void @outer_sum_subtract_u16( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: outer_sum_subtract_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: umops za3.s, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +; +; BMOPA/BMOPS +; + +define void @bitwise_outer_sum_accumulate_u32( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: bitwise_outer_sum_accumulate_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: bmopa za3.s, p0/m, p1/m, z0.s, z1.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +define void @bitwise_outer_sum_subtract_u32( %pn, %pm, %zn, %zm) { +; CHECK-LABEL: bitwise_outer_sum_subtract_u32: +; CHECK: // %bb.0: +; CHECK-NEXT: bmops za3.s, p0/m, p1/m, z0.s, z1.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, %pn, %pm, %zn, %zm) + ret void +} + +declare void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32, , , , ) +declare void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32, , , , ) + +declare void @llvm.aarch64.sme.smops.za32.nxv8i16(i32, , , , ) +declare void @llvm.aarch64.sme.umops.za32.nxv8i16(i32, , , , ) + +declare void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32, , , , ) +declare void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32, , , , ) +