diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -415,7 +415,7 @@
   let Arches = arches;
 }
 def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>;
-def TargetAArch64 : TargetArch<["aarch64"]>;
+def TargetAArch64 : TargetArch<["aarch64", "aarch64_be"]>;
 def TargetAnyArm : TargetArch<!listconcat(TargetARM.Arches, TargetAArch64.Arches)>;
 def TargetAVR : TargetArch<["avr"]>;
 def TargetBPF : TargetArch<["bpfel", "bpfeb"]>;
@@ -2434,37 +2434,43 @@
 }
 
 def ArmStreaming : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_streaming">];
+  let Spellings = [RegularKeyword<"__arm_streaming">,
+                   GNU<"arm_streaming">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
   let Documentation = [ArmSmeStreamingDocs];
 }
 
 def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_streaming_compatible">];
+  let Spellings = [RegularKeyword<"__arm_streaming_compatible">,
+                   GNU<"arm_streaming_compatible">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
   let Documentation = [ArmSmeStreamingCompatibleDocs];
 }
 
 def ArmSharedZA : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_shared_za">];
+  let Spellings = [RegularKeyword<"__arm_shared_za">,
+                   GNU<"arm_shared_za">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
   let Documentation = [ArmSmeSharedZADocs];
 }
 
 def ArmPreservesZA : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_preserves_za">];
+  let Spellings = [RegularKeyword<"__arm_preserves_za">,
+                   GNU<"arm_preserves_za">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
   let Documentation = [ArmSmePreservesZADocs];
 }
 
 def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_locally_streaming">];
+  let Spellings = [RegularKeyword<"__arm_locally_streaming">,
+                   GNU<"arm_locally_streaming">];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let Documentation = [ArmSmeLocallyStreamingDocs];
 }
 
 def ArmNewZA : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_new_za">];
+  let Spellings = [RegularKeyword<"__arm_new_za">,
+                   GNU<"arm_new_za">];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let Documentation = [ArmSmeNewZADocs];
 }
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -68,6 +68,9 @@
 TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte")
 TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte")
 
+// SME state function
+BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n")
+
 // Memory Operations
 TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops")
diff --git a/clang/include/clang/Basic/BuiltinsSVE.def b/clang/include/clang/Basic/BuiltinsSVE.def
--- a/clang/include/clang/Basic/BuiltinsSVE.def
+++ b/clang/include/clang/Basic/BuiltinsSVE.def
@@ -19,4 +19,3 @@
 
 #undef GET_SVE_BUILTINS
 #undef BUILTIN
-#undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -78,15 +78,27 @@
 clang_tablegen(arm_sve_sema_rangechecks.inc -gen-arm-sve-sema-rangechecks
   SOURCE arm_sve.td
   TARGET ClangARMSveSemaRangeChecks)
+clang_tablegen(arm_sve_streaming_attrs.inc -gen-arm-sve-streaming-attrs
+  SOURCE arm_sve.td
+  TARGET ClangARMSveStreamingAttrs)
 clang_tablegen(arm_sme_builtins.inc -gen-arm-sme-builtins
   SOURCE arm_sme.td
   TARGET ClangARMSmeBuiltins)
 clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen
   SOURCE arm_sme.td
   TARGET ClangARMSmeBuiltinCG)
+clang_tablegen(arm_sme_typeflags.inc -gen-arm-sme-typeflags
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeTypeFlags)
 clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks
   SOURCE arm_sme.td
   TARGET ClangARMSmeSemaRangeChecks)
+clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeStreamingAttrs)
+clang_tablegen(arm_sme_builtins_za_state.inc -gen-arm-sme-builtin-za-state
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeBuiltinsZAState)
 clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def
   SOURCE arm_cde.td
   TARGET ClangARMCdeBuiltinsDef)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3101,6 +3101,12 @@
 def err_attribute_arm_feature_sve_bits_unsupported : Error<
   "%0 is only supported when '-msve-vector-bits=' is specified with a "
   "value of 128, 256, 512, 1024 or 2048.">;
+def warn_attribute_arm_sm_incompat_builtin : Warning<
+  "builtin call has undefined behaviour when called from a %0 function">,
+  InGroup<DiagGroup<"undefined-arm-streaming">>;
+def warn_attribute_arm_za_builtin_no_za_state : Warning<
+  "builtin call is not valid when calling from a function without active ZA state">,
+  InGroup<DiagGroup<"undefined-arm-za">>;
 def err_sve_vector_in_non_sve_target : Error<
   "SVE vector type %0 cannot be used in a target without sve">;
 def err_attribute_riscv_rvv_bits_unsupported : Error<
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -62,7 +62,7 @@
 #include "clang/Basic/BuiltinsSME.def"
     FirstTSBuiltin,
   };
-  }
+  } // namespace SME
 
 /// AArch64 builtins
 namespace AArch64 {
@@ -309,6 +309,13 @@
   bool isTupleSet() const { return Flags & IsTupleSet; }
   bool isReadZA() const { return Flags & IsReadZA; }
   bool isWriteZA() const { return Flags & IsWriteZA; }
+  bool isZASliceBaseOffsetIntr() const {
+    return Flags & IsZASliceBaseOffsetIntr;
+  }
+  bool isTileBaseOffsetIntr() const {
+    return Flags & IsTileBaseOffsetIntr;
+  }
+  bool isReductionQV() const { return Flags & IsReductionQV; }
   uint64_t getBits() const { return Flags; }
   bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -15,6 +15,33 @@
 
 include "arm_sve_sme_incl.td"
 
+class Inst<string n, string p, string t, string i, list<FlagType> ft,
+           list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
+  string Name = n;
+  string Prototype = p;
+  string Types = t;
+  string TargetGuard = "sme";
+  int Merge = MergeNone.Value;
+  string MergeSuffix = MergeNone.Suffix;
+  string LLVMIntrinsic = i;
+  list<FlagType> Flags = ft;
+  list<ImmCheck> ImmChecks = ch;
+  int MemEltType = met.Value;
+}
+
+// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8")
+class SInst<string n, string p, string t, string i, list<FlagType> ft = [],
+            list<ImmCheck> ch = []>
+    : Inst<n, p, t, i, ft, ch> {
+}
+
+// MInst: Instructions which access memory
+class MInst<string n, string p, string t, list<FlagType> f,
+            MemEltType met = MemEltTyDefault, string i = "",
+            list<ImmCheck> ch = []>
+    : Inst<n, p, t, i, f, ch, met> {
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Loads
 
@@ -88,12 +115,12 @@
 multiclass ZARead<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
   let 
TargetGuard = "sme" in { - def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPimi", t, - MergeOp1, i_prefix # "_horiz", + def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]_m", "ddPimi", t, + i_prefix # "_horiz", [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; - def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPimi", t, - MergeOp1, i_prefix # "_vert", + def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]_m", "ddPimi", t, + i_prefix # "_vert", [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; } } @@ -109,12 +136,12 @@ multiclass ZAWrite ch> { let TargetGuard = "sme" in { - def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimiPd", t, - MergeOp1, i_prefix # "_horiz", + def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]_m", "vimiPd", t, + i_prefix # "_horiz", [IsWriteZA, IsStreaming, IsSharedZA], ch>; - def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimiPd", t, - MergeOp1, i_prefix # "_vert", + def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]_m", "vimiPd", t, + i_prefix # "_vert", [IsWriteZA, IsStreaming, IsSharedZA], ch>; } } @@ -129,10 +156,10 @@ // SME - Zero let TargetGuard = "sme" in { - def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", + def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_255>]>; - def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", + def SVZERO_ZA : SInst<"svzero_za", "v", "", "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsSharedZA]>; } @@ -141,7 +168,7 @@ multiclass ZACount { let TargetGuard = "sme" in { - def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, + def NAME : SInst<"sv" # n_suffix, "nv", "", "aarch64_sme_" # n_suffix, [IsOverloadNone, IsStreamingCompatible, IsPreservesZA]>; } @@ -157,13 +184,13 @@ multiclass ZAAdd { let TargetGuard = "sme" in { - def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1, + def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]_m", "viPPd", "iUi", "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { - def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1, + def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]_m", "viPPd", "lUl", "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>]>; } @@ -177,17 +204,17 @@ multiclass ZAIntOuterProd { let TargetGuard = "sme" in { - def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]", + def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]_m", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c", - MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", + "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { - def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]", + def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]_m", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "s", - MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", + "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>]>; } @@ -203,19 +230,19 @@ multiclass ZAIntOuterProdMixedSigns { let TargetGuard = "sme" in { - def NAME # _ZA32_B: SInst<"sv" # n_suffix1 # n_suffix2 # "_za32[_{d}]", + def NAME # _ZA32_B: SInst<"sv" # n_suffix1 # 
n_suffix2 # "_za32[_{d}]_m", "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "c", - MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", + "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { - def NAME # _ZA64_H: SInst<"sv" # n_suffix1 # n_suffix2 # "_za64[_{d}]", + def NAME # _ZA64_H: SInst<"sv" # n_suffix1 # n_suffix2 # "_za64[_{d}]_m", "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "s", - MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", + "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>]>; } @@ -231,25 +258,25 @@ multiclass ZAFPOuterProd { let TargetGuard = "sme" in { - def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h", - MergeOp1, "aarch64_sme_" # n_suffix # "_wide", + def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]_m", "viPPdd", "h", + "aarch64_sme_" # n_suffix # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; - def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b", - MergeOp1, "aarch64_sme_" # n_suffix # "_wide", + def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]_m", "viPPdd", "b", + "aarch64_sme_" # n_suffix # "_wide", [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; - def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f", - MergeOp1, "aarch64_sme_" # n_suffix, + def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]_m", "viPPdd", "f", + "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", - MergeOp1, "aarch64_sme_" # n_suffix, + def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]_m", "viPPdd", "d", + "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], [ImmCheck<0, ImmCheck0_7>]>; } @@ -257,3 +284,428 @@ defm SVMOPA : ZAFPOuterProd<"mopa">; defm SVMOPS : ZAFPOuterProd<"mops">; + +// FMLA/FMLS +let TargetGuard = "sme2" in { + def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vmi22", "f", "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vmi44", "f", "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vmi2d", "f", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x4_F32 : 
Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vmi4d", "f", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vmi2di", "f", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vmi4di", "f", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2,sme-f64f64" in { + def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vmi22", "d", "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vmi44", "d", "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vmi2d", "d", "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vmi4d", "d", "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vmi2di", "d", "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vmi4di", "d", "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; +} + +// FMLAL/FMLSL/UMLAL/SMLAL +// 
SMLALL/UMLALL/USMLALL/SUMLALL +let TargetGuard = "sme2" in { + // MULTI MLAL + def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vmi22", "c", "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vmi22", "Uc", "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vmi44", "c", "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vmi44", "Uc", "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vmi22", "s", "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vmi22", "Us", "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vmi44", "s", "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vmi44", "Us", "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // MULTI MLSL + def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "bh", "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "bh", "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "s", "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "s", "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], 
[ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vmi22", "Us", "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vmi44", "Us", "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vmi22", "c", "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vmi22", "Uc", "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vmi44", "c", "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vmi44", "Uc", "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + +let TargetGuard = "sme-i16i64" in { + def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vmi22", "s", "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vmi22", "Us", "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vmi44", "s", "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vmi44", "Us", "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // SINGLE MLAL + def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vmi2d", 
"Us", "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x1", "vmidd", "c", "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x1", "vmidd", "Uc", "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vmi2d", "c", "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vmi2d", "Uc", "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vmi4d", "c", "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vmi4d", "Uc", "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x1", "vmidd", "s", "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x1", "vmidd", "Us", "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vmi2d", "s", "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vmi2d", "Us", "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vmi4d", "s", "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vmi4d", "Us", "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // SINGLE MLSL + def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "bh", "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "bh", "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "bh", "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def 
SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "s", "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "s", "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "s", "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x1", "vmidd", "Us", "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>]>; + def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vmi2d", "Us", "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vmi4d", "Us", "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>]>; + + def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x1", "vmidd", "c", "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x1", "vmidd", "Uc", "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vmi2d", "c", "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vmi2d", "Uc", "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vmi4d", "c", "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vmi4d", "Uc", "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x1", "vmidd", "s", "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x1", "vmidd", "Us", "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vmi2d", "s", "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vmi2d", "Us", "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vmi4d", "s", "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vmi4d", "Us", "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + } + + // INDEXED MLAL + def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + + def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmiddi", "c", "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmiddi", "Uc", "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vmi2di", "c", "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vmi2di", "Uc", "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vmi4di", "c", "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vmi4di", 
"Uc", "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmiddi", "s", "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmiddi", "Us", "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vmi2di", "s", "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vmi2di", "Us", "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vmi4di", "s", "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vmi4di", "Us", "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + } + + // INDEXED MLSL + def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "bh", "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "bh", "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "bh", "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "s", "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "s", "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "s", "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmiddi", "Us", "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_14_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vmi2di", "Us", "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vmi4di", "Us", "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_6_Mul2>, ImmCheck<4, ImmCheck0_7>]>; + + def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmiddi", "c", "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmiddi", "Uc", "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vmi2di", "c", "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vmi2di", "Uc", "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vmi4di", "c", "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vmi4di", "Uc", "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + let TargetGuard = "sme-i16i64" in { + def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmiddi", "s", "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmiddi", "Us", "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vmi2di", "s", "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vmi2di", "Us", "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vmi4di", "s", "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vmi4di", "Us", "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_7>]>; + } + + // SINGLE SUMLALL + def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_u8]_vg4x2", "vmi2.xd", "Uc", "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_u8]_vg4x4", "vmi4.xd", "Uc", "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // INDEXED SUMLALL + def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmidui", "c", "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, 
ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vmi2ui", "c", "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vmi4ui", "c", "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + // SINGLE USMLALL + def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla[_single]_za32[_{d}]_vg4x1", "vmiud", "c", "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>]>; + def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vmi2.ud", "c", "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vmi4.ud", "c", "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // MULTI USMLALL + def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vmi2.u2", "c", "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vmi4.u4", "c", "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>]>; + + // INDEXED USMLALL + def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmidxi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_12_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vmi2xi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vmi4xi", "Uc", "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_4_Mul4>, ImmCheck<4, ImmCheck0_15>]>; + + // VERTICAL DOT-PRODUCT + def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "s", "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "c", "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "Us", "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "Uc", "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vmi2di", "hb", "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVSUVDOT_LANE_ZA32_VG1x4 : 
Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "c", "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vmi4di", "Uc", "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + let TargetGuard = "sme-i16i64" in { + def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vmi4di", "s", "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vmi4di", "Us", "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + } + + // Multi-vector signed & unsigned integer dot-product + def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "cs", "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "cs", "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x2_U : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "UcUs", "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "UcUs", "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "cs", "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "cs", "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_U : Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "UcUs", "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "UcUs", "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "cs", "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "cs", "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "UcUs", "aarch64_sme_udot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "UcUs", "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + def SVUSDOT_SINGLE_ZA32_VG1x2 : 
Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vmi2.dx", "Uc", "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vmi4.dx", "Uc", "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vmi2.d2.x", "Uc", "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vmi4.d4.x", "Uc", "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vmi2.dxi", "Uc", "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vmi4.dxi", "Uc", "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vmi2.du", "c", "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vmi4.du", "c", "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vmi2.dui", "c", "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vmi4.dui", "c", "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + + let TargetGuard = "sme-i16i64" in { + def SVDOT_SINGLE_ZA64_VG1x2_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vmi2d", "s", "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vmi4d", "s", "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vmi2d", "Us", "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vmi4d", "Us", "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vmi22", "s", "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vmi44", "s", "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vmi22", "Us", "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsSharedZA, 
IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vmi44", "Us", "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vmi2di", "s", "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vmi4di", "s", "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vmi2di", "Us", "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vmi4di", "Us", "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; + } + + // Multi-vector half-precision/BFloat16 floating-point dot-product + def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vmi2d", "bh", "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vmi4d", "bh", "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vmi22", "bh", "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vmi44", "bh", "aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vmi2di", "bh", "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vmi4di", "bh", "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>, ImmCheck<4, ImmCheck0_3>]>; +} + +// == ADD /SUB == +let TargetGuard = "sme2" in { + def SVADD_WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"svadd_write[_single]_za32[_{d}]_vg1x2", "vmi2d", "iUi", "aarch64_sme_add_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"svadd_write[_single]_za32[_{d}]_vg1x4", "vmi4d", "iUi", "aarch64_sme_add_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"svsub_write[_single]_za32[_{d}]_vg1x2", "vmi2d", "Ui", "aarch64_sme_sub_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"svsub_write[_single]_za32[_{d}]_vg1x4", "vmi4d", "Ui", "aarch64_sme_sub_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_WRITE_ZA32_VG1x2_I32 : Inst<"svadd_write_za32[_{d}]_vg1x2", "vmi22", "iUi", 
"aarch64_sme_add_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_ZA32_VG1x4_I32 : Inst<"svadd_write_za32[_{d}]_vg1x4", "vmi44", "iUi", "aarch64_sme_add_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_ZA32_VG1x2_I32 : Inst<"svsub_write_za32[_{d}]_vg1x2", "vmi22", "Ui", "aarch64_sme_sub_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_ZA32_VG1x4_I32 : Inst<"svsub_write_za32[_{d}]_vg1x4", "vmi44", "Ui", "aarch64_sme_sub_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_ZA32_VG1x2_I32 : Inst<"svadd_za32[_{d}]_vg1x2", "vmi2", "iUif", "aarch64_sme_add_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_ZA32_VG1X4_I32 : Inst<"svadd_za32[_{d}]_vg1x4", "vmi4", "iUif", "aarch64_sme_add_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_ZA32_VG1X2_I32 : Inst<"svsub_za32[_{d}]_vg1x2", "vmi2", "Uif", "aarch64_sme_sub_za32_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_ZA32_VG1X4_I32 : Inst<"svsub_za32[_{d}]_vg1x4", "vmi4", "Uif", "aarch64_sme_sub_za32_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + let TargetGuard = "sme-i16i64" in { + def SVADD_WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"svadd_write[_single]_za64[_{d}]_vg1x2", "vmi2d", "lUl", "aarch64_sme_add_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"svadd_write[_single]_za64[_{d}]_vg1x4", "vmi4d", "lUl", "aarch64_sme_add_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"svsub_write[_single]_za64[_{d}]_vg1x2", "vmi2d", "Ul", "aarch64_sme_sub_write_single_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"svsub_write[_single]_za64[_{d}]_vg1x4", "vmi4d", "Ul", "aarch64_sme_sub_write_single_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_WRITE_ZA64_VG1x2_I64 : Inst<"svadd_write_za64[_{d}]_vg1x2", "vmi22", "lUl", "aarch64_sme_add_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_WRITE_ZA64_VG1x4_I64 : Inst<"svadd_write_za64[_{d}]_vg1x4", "vmi44", "lUl", "aarch64_sme_add_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVSUB_WRITE_ZA64_VG1x2_I64 : Inst<"svsub_write_za64[_{d}]_vg1x2", "vmi22", "Ul", "aarch64_sme_sub_write_za_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVSUB_WRITE_ZA64_VG1x4_I64 : Inst<"svsub_write_za64[_{d}]_vg1x4", "vmi44", "Ul", "aarch64_sme_sub_write_za_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + + def SVADD_ZA64_VG1X2_I64 : Inst<"svadd_za64[_{d}]_vg1x2", "vmi2", "lUl", "aarch64_sme_add_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>; + def SVADD_ZA64_VG1X4_I64 : Inst<"svadd_za64[_{d}]_vg1x4", "vmi4", "lUl", "aarch64_sme_add_za64_vg1x4", [IsStreaming, IsSharedZA, 
+
+ def SVSUB_ZA64_VG1X2_I64 : Inst<"svsub_za64[_{d}]_vg1x2", "vmi2", "Ul", "aarch64_sme_sub_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVSUB_ZA64_VG1X4_I64 : Inst<"svsub_za64[_{d}]_vg1x4", "vmi4", "Ul", "aarch64_sme_sub_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ }
+
+ let TargetGuard = "sme-f64f64" in {
+ def SVADD_ZA64_VG1X2_F64 : Inst<"svadd_za64[_{d}]_vg1x2", "vmi2", "d", "aarch64_sme_add_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVADD_ZA64_VG1X4_F64 : Inst<"svadd_za64[_{d}]_vg1x4", "vmi4", "d", "aarch64_sme_add_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+
+ def SVSUB_ZA64_VG1X2_F64 : Inst<"svsub_za64[_{d}]_vg1x2", "vmi2", "d", "aarch64_sme_sub_za64_vg1x2", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVSUB_ZA64_VG1X4_F64 : Inst<"svsub_za64[_{d}]_vg1x4", "vmi4", "d", "aarch64_sme_sub_za64_vg1x4", [IsStreaming, IsSharedZA, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ }
+}
+//
+// 2 and 4 vector-group read/write intrinsics.
+//
+
+multiclass WriteHV_VG<string n, string t, string i, list<ImmCheck> checkvg2, list<ImmCheck> checkvg4> {
+ def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2", "vimi2", t, i # "_hor_vg2", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>;
+ def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2", "vimi2", t, i # "_ver_vg2", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>;
+ def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4", "vimi4", t, i # "_hor_vg4", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>;
+ def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4", "vimi4", t, i # "_ver_vg4", [IsSharedZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>;
+}
+
+let TargetGuard = "sme2" in {
+ defm SVWRITE_ZA8 : WriteHV_VG<"za8[_{d}]", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_14_Mul2>], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_12_Mul4>]>;
+ defm SVWRITE_ZA16 : WriteHV_VG<"za16[_{d}]", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_6_Mul2>], [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_4_Mul4>]>;
+ defm SVWRITE_ZA32 : WriteHV_VG<"za32[_{d}]", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_2_Mul2>], [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_0>]>;
+ defm SVWRITE_ZA64 : WriteHV_VG<"za64[_{d}]", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_0>], [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_0>]>;
+}
+
+multiclass ReadHV_VG<string n, string t, string i, list<ImmCheck> checkvg2, list<ImmCheck> checkvg4> {
+ def NAME # _VG2_H : Inst<"svread_hor_" # n # "_vg2", "2imi", t, i # "_hor_vg2", [IsSharedZA, IsPreservesZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>;
+ def NAME # _VG2_V : Inst<"svread_ver_" # n # "_vg2", "2imi", t, i # "_ver_vg2", [IsSharedZA, IsPreservesZA, IsStreaming, IsTileBaseOffsetIntr], checkvg2>;
+ def NAME # _VG4_H : Inst<"svread_hor_" # n # "_vg4", "4imi", t, i # "_hor_vg4", [IsSharedZA, IsPreservesZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>;
+ def NAME # _VG4_V : Inst<"svread_ver_" # n # "_vg4", "4imi", t, i # "_ver_vg4", [IsSharedZA, IsPreservesZA, IsStreaming, IsTileBaseOffsetIntr], checkvg4>;
+}
+
+let TargetGuard = "sme2" in {
+ defm SVREAD_ZA8 : ReadHV_VG<"za8_{d}", "cUc", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_14_Mul2>], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_12_Mul4>]>;
+ defm SVREAD_ZA16 : ReadHV_VG<"za16_{d}", "sUshb", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_6_Mul2>], [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_4_Mul4>]>;
+ defm SVREAD_ZA32 : ReadHV_VG<"za32_{d}", "iUif", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_2_Mul2>], [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_0>]>;
+ defm SVREAD_ZA64 : ReadHV_VG<"za64_{d}", "lUld", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_0>], [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_0>]>;
+}
+
+
+//
+// Single vector-group read/write intrinsics.
+//
+
+let TargetGuard = "sme2" in {
+ def SVWRITE_ZA64_VG1x2 : Inst<"svwrite_za64[_{d}]_vg1x2", "vmi2", "lUld", "aarch64_sme_write_vg1x2", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVWRITE_ZA64_VG1x4 : Inst<"svwrite_za64[_{d}]_vg1x4", "vmi4", "lUld", "aarch64_sme_write_vg1x4", [IsSharedZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVREAD_ZA64_VG1x2 : Inst<"svread_za64_{d}_vg1x2", "2mi", "lUld", "aarch64_sme_read_vg1x2", [IsSharedZA, IsPreservesZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+ def SVREAD_ZA64_VG1x4 : Inst<"svread_za64_{d}_vg1x4", "4mi", "lUld", "aarch64_sme_read_vg1x4", [IsSharedZA, IsPreservesZA, IsStreaming, IsZASliceBaseOffsetIntr], [ImmCheck<1, ImmCheck0_7>]>;
+}
+
+//
+// Outer product and accumulate/subtract
+//
+
+let TargetGuard = "sme2" in {
+ def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", "aarch64_sme_smopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+ def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", "aarch64_sme_umopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+
+ def SVSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "s", "aarch64_sme_smops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+ def SVUSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "Us", "aarch64_sme_umops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+
+ def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "Ui", "aarch64_sme_bmopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+
+ def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "Ui", "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+//
+// Spill and fill of ZT0
+//
+
+let TargetGuard = "sme2" in {
+ def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+ def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+}
+
+//
+// Zero ZT0
+//
+
+let TargetGuard = "sme2" in {
+ def SVZERO_ZT : Inst<"svzero_zt", "vi", "", "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+}
+
+//
+// lookup table expand one register
+//
+
+let TargetGuard = "sme2" in {
+ def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt[_{d}]", "didi", "UcUsUi", "aarch64_sme_luti2_lane_zt", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>;
+ def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt[_{d}]", "didi", "UcUsUi", "aarch64_sme_luti4_lane_zt", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
+}
+
+//
+// lookup table expand two contiguous registers
+//
+
+let TargetGuard = "sme2" in {
+ def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt[_{d}]_x2", "2.didi", "UcUsUi", "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
+ def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt[_{d}]_x2", "2.didi", "UcUsUi", "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
+}
+
+//
+// lookup table expand four contiguous registers
+//
+
+let TargetGuard = "sme2" in {
+ def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt[_{d}]_x4", "4.didi", "UcUsUi", "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
+ def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt[_{d}]_x4", "4.didi", "UsUi", "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
+}
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -15,31 +15,74 @@
 include "arm_sve_sme_incl.td"
 
+class Inst<string n, string p, string t, MergeType mt, string i, list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+  string Name = n;
+  string Prototype = p;
+  string Types = t;
+  string TargetGuard = "sve";
+  int Merge = mt.Value;
+  string MergeSuffix = mt.Suffix;
+  string LLVMIntrinsic = i;
+  list<FlagType> Flags = ft;
+  list<ImmCheck> ImmChecks = ch;
+  int MemEltType = met.Value;
+}
+
+// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8")
+class SInst<string n, string p, string t, MergeType mt, string i = "", list<FlagType> ft = [], list<ImmCheck> ch = []>
+  : Inst<n, p, t, mt, i, ft, ch, MemEltTyDefault> {
+}
+
+// MInst: Instructions which access memory
+class MInst<string n, string p, string t, list<FlagType> f,
+            MemEltType met = MemEltTyDefault, string i = "",
+            list<ImmCheck> ch = []>
+  : Inst<n, p, t, MergeNone, i, f, ch, met> {
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Loads
 
 // Load one vector (scalar base)
-def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
-def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">;
-def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">;
-def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">;
-def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">;
-def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">;
-def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">;
+def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">;
+def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">;
+def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">;
+def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">;
+def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">;
+def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">;
+def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">;
 
 let TargetGuard = "sve,bf16" in {
= "sve,bf16" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +} + +let TargetGuard = "sve2p1" in { + // Contiguous zero-extending load to quadword (single vector). + def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">; + def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">; + + def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">; + def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">; + + // Contiguous truncating store from quadword (single vector). + def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">; + def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">; + + def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">; + def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">; } // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, 
"aarch64_sve_ld1_gather_scalar_offset">; @@ -243,25 +286,25 @@ } // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; let TargetGuard = "sve,bf16" in { - def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; - def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; } // Load one quadword and replicate (scalar base) -def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">; +def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">; + def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; } multiclass StructLoad { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { def: SInst; } @@ -286,42 +329,42 @@ } let TargetGuard = "sve,bf16" in { - def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; + def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>; - def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFDOT_N : 
SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } //////////////////////////////////////////////////////////////////////////////// // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], 
MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; let TargetGuard = "sve,bf16" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; } // Store one vector (vector base) @@ -394,7 +437,7 @@ def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; multiclass StructStore { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { def: SInst; } @@ -410,30 +453,30 @@ defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; let TargetGuard = "sve,bf16" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; } //////////////////////////////////////////////////////////////////////////////// // Prefetches // Prefetch (Scalar base) -def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, 
"aarch64_sve_prf">; +def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Scalar base, VL displacement) -def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; @@ -479,16 +522,16 @@ //////////////////////////////////////////////////////////////////////////////// // Scalar to vector -def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone>; -def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone>; +def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone>; + def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone, "", [IsStreamingCompatible]>; } -def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone>; -def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; +def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone, "", [IsStreamingCompatible]>; multiclass svdup_base { - def NAME : SInst; + def NAME : SInst; let TargetGuard = "sve,bf16" in { def _BF16: SInst; } @@ -499,14 +542,14 @@ defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs", MergeAnyExp, "aarch64_sve_dup">; defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs", MergeZeroExp, "aarch64_sve_dup">; -def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index">; +def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index", [IsStreamingCompatible]>; // Integer arithmetic -multiclass SInstZPZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; } defm SVABS : SInstZPZ<"svabs", "csil", "aarch64_sve_abs">; @@ -515,13 +558,13 @@ //------------------------------------------------------------------------------ multiclass SInstZPZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M 
-  def _N_X : SInst;
-  def _N_Z : SInst;
+  def _N_M : SInst;
+  def _N_X : SInst;
+  def _N_Z : SInst;
 }
 
 defm SVABD_S : SInstZPZZ<"svabd", "csil", "aarch64_sve_sabd", "aarch64_sve_sabd_u">;
@@ -541,6 +584,12 @@
 defm SVSUB : SInstZPZZ<"svsub", "csilUcUsUiUl", "aarch64_sve_sub", "aarch64_sve_sub_u">;
 defm SVSUBR : SInstZPZZ<"svsubr", "csilUcUsUiUl", "aarch64_sve_subr", "aarch64_sve_sub_u", [ReverseMergeAnyBinOp]>;
 
+let TargetGuard = "sve2p1,b16b16" in {
+ defm SVMUL_BF : SInstZPZZ<"svmul", "b", "aarch64_sve_fmul", "aarch64_sve_fmul_u">;
+ defm SVADD_BF : SInstZPZZ<"svadd", "b", "aarch64_sve_fadd", "aarch64_sve_fadd_u">;
+ defm SVSUB_BF : SInstZPZZ<"svsub", "b", "aarch64_sve_fsub", "aarch64_sve_fsub_u">;
+}
+
 //------------------------------------------------------------------------------
 
 multiclass SInstZPZZZ flags=[]> {
@@ -553,29 +602,29 @@
   def _N_Z : SInst;
 }
 
-defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp]>;
-defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u">;
-defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u">;
-defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp]>;
+defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>;
+defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u", [IsStreamingCompatible]>;
+defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u", [IsStreamingCompatible]>;
+defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>;
 
 //------------------------------------------------------------------------------
 
-def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot">;
-def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot">;
-def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x">;
-def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">;
-def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x">;
-def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">;
+def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>;
+def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>;
+def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>;
+def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>;
+def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>;
+def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>;
 
-def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot">;
-def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot">;
-def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x">;
-def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">;
-def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x">;
-def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">;
SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; //////////////////////////////////////////////////////////////////////////////// // Logical operations @@ -592,107 +641,115 @@ // Shifts multiclass SInst_SHIFT { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; - def _WIDE_M : SInst; - def _WIDE_X : SInst; - def _WIDE_Z : SInst; + def _WIDE_M : SInst; + def _WIDE_X : SInst; + def _WIDE_Z : SInst; - def _WIDE_N_M : SInst; - def _WIDE_N_X : SInst; - def _WIDE_N_Z : SInst; + def _WIDE_N_M : SInst; + def _WIDE_N_X : SInst; + def _WIDE_N_Z : SInst; } defm SVASR : SInst_SHIFT<"svasr", "aarch64_sve_asr", "csil", "csi">; defm SVLSL : SInst_SHIFT<"svlsl", "aarch64_sve_lsl", "csilUcUsUiUl", "csiUcUsUi">; defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">; -def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">; +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr">; + def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, 
"aarch64_sve_insr", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Integer reductions -def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv">; -def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv">; -def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv">; -def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv">; -def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv">; -def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv">; -def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv">; -def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv">; -def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv">; +def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv", [IsStreamingCompatible]>; +def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv", [IsStreamingCompatible]>; +def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv", [IsStreamingCompatible]>; +def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv", [IsStreamingCompatible]>; +def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv", [IsStreamingCompatible]>; +def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv", [IsStreamingCompatible]>; +def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv", [IsStreamingCompatible]>; +def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv", [IsStreamingCompatible]>; +def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv", [IsStreamingCompatible]>; +def SVORQV : SInst<"svorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv", [IsReductionQV]>; +def SVEORQV : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv", [IsReductionQV]>; +def SVADDQV : SInst<"svaddqv[_{d}]", "{Pd", "hfdcsilUcUsUiUl", MergeNone, "aarch64_sve_addqv", [IsReductionQV]>; +def SVANDQV : SInst<"svandqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andqv", [IsReductionQV]>; +def SVSMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_smaxqv", [IsReductionQV]>; +def SVUMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxqv", [IsReductionQV]>; +def SVSMINQV : SInst<"svminqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_sminqv", [IsReductionQV]>; +def SVUMINQV : SInst<"svminqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_uminqv", [IsReductionQV]>; //////////////////////////////////////////////////////////////////////////////// // Integer comparisons -def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHI : 
SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; -def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; - -def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; -def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; - -def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; - -def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; +def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", 
MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", 
+def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>;
+
+def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>;
+def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>;
+def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>;
+def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>;
+def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>;
+def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>;
+def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>;
+def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>;
+def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>;
+def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // While comparisons
 
-def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>;
-def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>;
-def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>;
-def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>;
-def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>;
-def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>;
-def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>;
-def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>;
+def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>;
+def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>;
"aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -703,12 +760,12 @@ def _Z : SInst; } -defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls">; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz">; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt">; +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [IsStreamingCompatible]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [IsStreamingCompatible]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt">; + defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -748,6 +805,13 @@ defm SVSUB_F : SInstZPZZ<"svsub", "hfd", "aarch64_sve_fsub", "aarch64_sve_fsub_u">; defm SVSUBR_F : SInstZPZZ<"svsubr", "hfd", "aarch64_sve_fsubr", "aarch64_sve_fsub_u", [ReverseMergeAnyBinOp]>; +let TargetGuard = "sve2p1,b16b16" in { + defm SVMAXNM_BF : SInstZPZZ<"svmaxnm","b", "aarch64_sve_fmaxnm", "aarch64_sve_fmaxnm_u">; + defm SVMINNM_BF : SInstZPZZ<"svminnm","b", "aarch64_sve_fminnm", "aarch64_sve_fminnm_u">; + defm SVMAX_BF : SInstZPZZ<"svmax", "b", "aarch64_sve_fmax", "aarch64_sve_fmax_u">; + defm SVMIN_BF : SInstZPZZ<"svmin", "b", "aarch64_sve_fmin", "aarch64_sve_fmin_u">; +} + defm SVRECPX : SInstZPZ<"svrecpx", "hfd", "aarch64_sve_frecpx">; defm SVRINTA : SInstZPZ<"svrinta", "hfd", "aarch64_sve_frinta">; defm SVRINTI : SInstZPZ<"svrinti", "hfd", "aarch64_sve_frinti">; @@ -763,79 +827,94 @@ def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; -def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale">; - -def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale">; - -defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [ReverseMergeAnyAccOp]>; -defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u">; -defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u">; -defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [ReverseMergeAnyAccOp]>; -defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [ReverseMergeAnyAccOp]>; -defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u">; -defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u">; -defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [ReverseMergeAnyAccOp]>; - -def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [], [ImmCheck<3, 
-def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>;
-def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>;
-def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
-def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
-def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
-
-def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>,
+def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+
+def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>;
+
+defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>;
+defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u", [IsStreamingCompatible]>;
+defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u", [IsStreamingCompatible]>;
+defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>;
+defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>;
+defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u", [IsStreamingCompatible]>;
+defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u", [IsStreamingCompatible]>;
+defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>;
+
+def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>;
+def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>;
+def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>;
+def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
+def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
+def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>;
+
+let TargetGuard = "sve2p1,b16b16" in {
+ defm SVMLA_BF : SInstZPZZZ<"svmla", "b", "aarch64_sve_fmla", "aarch64_sve_fmla_u", [IsStreamingCompatible]>;
"aarch64_sve_fmla_u", [IsStreamingCompatible]>; + defm SVMLS_BF : SInstZPZZZ<"svmls", "b", "aarch64_sve_fmls", "aarch64_sve_fmls_u", [IsStreamingCompatible]>; +} + +def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; + +let TargetGuard = "sve2p1,b16b16" in { + def SVMLA_LANE_BF : SInst<"svmla_lane[_{d}]", "ddddi", "b", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + def SVMLS_LANE_BF : SInst<"svmls_lane[_{d}]", "ddddi", "b", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + def SVMUL_LANE_BF : SInst<"svmul_lane[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +} -def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x">; -def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x">; -def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x">; -def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x">; +def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x", [IsStreamingCompatible]>; +def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x", [IsStreamingCompatible]>; +def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x", [IsStreamingCompatible]>; +def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; -def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv">; -def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv">; -def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv">; -def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv">; -def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv">; +def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [IsStreamingCompatible]>; +def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [IsStreamingCompatible]>; +def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", 
[IsStreamingCompatible]>; +def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv", [IsStreamingCompatible]>; +def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv", [IsStreamingCompatible]>; +def SVFMAXNMQV: SInst<"svmaxnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxnmqv", [IsReductionQV]>; +def SVFMINNMQV: SInst<"svminnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminnmqv", [IsReductionQV]>; +def SVFMAXQV: SInst<"svmaxqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxqv", [IsReductionQV]>; +def SVFMINQV: SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv", [IsReductionQV]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point comparisons -def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo">; - -def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; - -def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; - -def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; + +def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, 
"aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; + +def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point conversions @@ -843,16 +922,16 @@ multiclass SInstCvtMXZ< string name, string m_types, string xz_types, string types, string intrinsic, list flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; } multiclass SInstCvtMX flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; + def _M : SInst; + def _X : SInst; } // svcvt_s##_f16 @@ -866,7 +945,7 @@ let TargetGuard = "sve,bf16" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", "aarch64_sve_fcvt_bf16f32">; - def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone]>; + def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, IsStreamingCompatible]>; } // svcvt_s##_f64 @@ -930,11 +1009,11 @@ defm SVCVTX_F32 : SInstCvtMXZ<"svcvtx_f32[_f64]", "MMPd", "MPd", "d", "aarch64_sve_fcvtx_f32f64">; -def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone]>; -def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone]>; +def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone, IsStreamingCompatible]>; +def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, 
"aarch64_sve_fcvtnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTNT_X : Implemented as macro by SveEmitter.cpp -def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone]>; +def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTXNT_X_F32 : Implemented as macro by SveEmitter.cpp } @@ -943,9 +1022,9 @@ // Permutations and selection multiclass SVEPerm { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -959,129 +1038,133 @@ // splat of any possible lane. It is upto LLVM to pick a more efficient // instruction such as DUP (indexed) if the lane index fits the range of the // instruction's immediate. -def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { def SVDUP_LANE_BF16 : - SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl">; + SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane">; +def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; + def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; } -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl">; + def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1">; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2">; -def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi">; 
-def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi">; -def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo">; -def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo">; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1">; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2">; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [IsStreamingCompatible]>; +def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [IsStreamingCompatible]>; +def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [IsStreamingCompatible]>; +def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [IsStreamingCompatible]>; +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; -def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev">; -def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel">; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice">; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2">; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1">; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2">; -} - -def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev">; -def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone]>; -def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone]>; -def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone]>; -def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel">; -def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1">; -def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone]>; -def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone]>; -def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", 
"Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone]>; -def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2">; -def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone]>; -def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone]>; -def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone]>; -def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi">; -def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo">; -def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1">; -def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone]>; -def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone]>; -def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone]>; -def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2">; -def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone]>; -def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone]>; -def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone]>; -def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1">; -def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone]>; -def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone]>; -def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone]>; -def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2">; -def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone]>; -def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone]>; -def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone]>; +def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +} + +def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone, 
IsStreamingCompatible]>; +def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi", [IsStreamingCompatible]>; +def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo", [IsStreamingCompatible]>; +def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone, IsStreamingCompatible]>; 
//////////////////////////////////////////////////////////////////////////////// // Predicate creation -def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone]>; +def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; -def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">; -def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>; +let TargetGuard = "sve2p1|sme2" in { +def SVPFALSE_COUNT_ALIAS : SInst<"svpfalse_c", "}v", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; +} -def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; -def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; -def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone>; -def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone>; -def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>; +def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; +def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible, IsAppendSVALL]>; + +def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone, "", [IsStreamingCompatible]>; +def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone, "", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate operations -def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z">; -def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z">; -def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z">; -def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z">; -def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z">; -def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z">; -def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z">; - -def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka">; -def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z">; -def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb">; -def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z">; -def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z">; -def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z">; -def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z">; - -def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst">; -def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext">; +def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [IsStreamingCompatible]>; +def SVBIC_B_Z : 
SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [IsStreamingCompatible]>; +def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [IsStreamingCompatible]>; +def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z", [IsStreamingCompatible]>; +def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z", [IsStreamingCompatible]>; +def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z", [IsStreamingCompatible]>; +def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z", [IsStreamingCompatible]>; + +def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka", [IsStreamingCompatible]>; +def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z", [IsStreamingCompatible]>; +def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb", [IsStreamingCompatible]>; +def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z", [IsStreamingCompatible]>; +def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z", [IsStreamingCompatible]>; +def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z", [IsStreamingCompatible]>; +def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z", [IsStreamingCompatible]>; + +def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst", [IsStreamingCompatible]>; +def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Testing predicates -def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any">; -def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first">; -def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last">; +def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any", [IsStreamingCompatible]>; +def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first", [IsStreamingCompatible]>; +def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // FFR manipulation @@ -1094,21 +1177,21 @@ //////////////////////////////////////////////////////////////////////////////// // Counting elements -def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone]>; -def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone]>; -def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone]>; -def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone]>; +def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, 
"aarch64_sve_cntw", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone, IsStreamingCompatible]>; -def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone]>; +def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; -def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp">; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone>; +def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [IsStreamingCompatible]>; +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone>; +def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1125,20 +1208,20 @@ def UnsignedDoubleWord : sat_type<"U", "Ul">; multiclass SInst_SAT1 { - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } multiclass SInst_SAT2 { - def "" : SInst]>; - def _ALL : SInst]>; + def "" : SInst]>; + def _ALL : SInst]>; - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } defm SVQDECB_S : SInst_SAT1<"svqdecb", "aarch64_sve_sqdecb", SignedByte>; @@ -1159,32 +1242,32 @@ defm SVQINCD_S : SInst_SAT2<"svqincd", "aarch64_sve_sqincd", SignedDoubleWord>; defm SVQINCD_U : SInst_SAT2<"svqincd", "aarch64_sve_uqincd", UnsignedDoubleWord>; -def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp">; -def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp">; -def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp">; -def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp">; +def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp", [IsStreamingCompatible]>; +def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp", [IsStreamingCompatible]>; +def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp", [IsStreamingCompatible]>; +def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp", [IsStreamingCompatible]>; -def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32">; -def SVQDECP_N_S64 : 
SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64">; -def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32">; -def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64">; -def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32">; -def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64">; -def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32">; -def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64">; +def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64", [IsStreamingCompatible]>; +def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [IsStreamingCompatible]>; let TargetGuard = "sve,i8mm" in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; -def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot">; -def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot">; -def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; -def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; +def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; +def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; -def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [IsStreamingCompatible], [ImmCheck<3, 
ImmCheckLaneIndexDot, 2>]>; } let TargetGuard = "sve,f32mm" in { @@ -1195,74 +1278,87 @@ def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } let TargetGuard = "sve,bf16,f64mm" in { def SVTRN1Q_BF16 : SInst<"svtrn1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q_BF16 : SInst<"svtrn2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q">; +def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector creation -def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; +def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", 
"csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef]>; +def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; + +def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +} -def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; +let TargetGuard = "sme2" in { + def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; + def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction -def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], 
[ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; let TargetGuard = "sve,bf16" in { -def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2" in { + def SVGET_2_B : SInst<"svget2[_{d}]", "d2i", "Pc", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; + def SVGET_4_B : SInst<"svget4[_{d}]", "d4i", "Pc", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; + + def SVSET_2_B : SInst<"svset2[_{d}]", "22id", "Pc", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; + def SVSET_4_B : SInst<"svset4[_{d}]", "44id", "Pc", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def 
SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1304,49 +1400,49 @@ } let TargetGuard = "sve2" in { -defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl">; -defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl">; -defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl">; -defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl">; -defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl">; -defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl">; -defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd">; -defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd">; - -def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba">; -def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba">; -def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [], [ImmCheck<3, 
ImmCheckLaneIndex, 2>]>; - -def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [IsStreamingCompatible]>; +defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [IsStreamingCompatible]>; +defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [IsStreamingCompatible]>; +defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl", [IsStreamingCompatible]>; +defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl", [IsStreamingCompatible]>; +defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl", [IsStreamingCompatible]>; +defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd", [IsStreamingCompatible]>; +defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd", [IsStreamingCompatible]>; + +def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", 
MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + +def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", 
[IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1358,29 +1454,29 @@ } let TargetGuard = "sve2" in { -defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp">; -defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp">; -defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp">; -defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp">; -defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp">; -defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp">; -defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp">; -defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp">; -defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp">; -defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp">; +defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [IsStreamingCompatible]>; +defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [IsStreamingCompatible]>; +defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [IsStreamingCompatible]>; +defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp", [IsStreamingCompatible]>; +defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp", [IsStreamingCompatible]>; +defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp", [IsStreamingCompatible]>; +defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp", [IsStreamingCompatible]>; +defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp", [IsStreamingCompatible]>; +defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [IsStreamingCompatible]>; +defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic let TargetGuard = "sve2" in { -def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp">; -def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp">; -def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp">; +def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [IsStreamingCompatible]>; -def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp">; -def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp">; -def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp">; +def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ 
-1388,56 +1484,56 @@ // let TargetGuard = "sve2" in { -def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; - -def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; -def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; + +def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; +def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic let TargetGuard = "sve2" in { -def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT : 
SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; -def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements let TargetGuard = "sve2" in { -def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic let TargetGuard = "sve2" in { -def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", 
[IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; } @@ -1445,18 +1541,18 @@ // SVE2 - Widening DSP operations multiclass SInstWideDSPAcc<string n, string t, string i> { - def : SInst<n # "[_{d}]", "ddhh", t, MergeNone, i>; - def _N : SInst<n # "[_n_{d}]", "ddhR", t, MergeNone, i>; + def : SInst<n # "[_{d}]", "ddhh", t, MergeNone, i, [IsStreamingCompatible]>; + def _N : SInst<n # "[_n_{d}]", "ddhR", t, MergeNone, i, [IsStreamingCompatible]>; } multiclass SInstWideDSPLong<string n, string t, string i> { - def : SInst<n # "[_{d}]", "dhh", t, MergeNone, i>; - def _N : SInst<n # "[_n_{d}]", "dhR", t, MergeNone, i>; + def : SInst<n # "[_{d}]", "dhh", t, MergeNone, i, [IsStreamingCompatible]>; + def _N : SInst<n # "[_n_{d}]", "dhR", t, MergeNone, i, [IsStreamingCompatible]>; } multiclass SInstWideDSPWide<string n, string t, string i> { - def : SInst<n # "[_{d}]", "ddh", t, MergeNone, i>; - def _N : SInst<n # "[_n_{d}]", "ddR", t, MergeNone, i>; + def : SInst<n # "[_{d}]", "ddh", t, MergeNone, i, [IsStreamingCompatible]>; + def _N : SInst<n # "[_n_{d}]", "ddR", t, MergeNone, i, [IsStreamingCompatible]>; } let TargetGuard = "sve2" in { @@ -1505,87 +1601,87 @@ defm SVSUBWT_S : SInstWideDSPWide<"svsubwt", "sil", "aarch64_sve_ssubwt">; defm SVSUBWT_U : SInstWideDSPWide<"svsubwt", "UsUiUl", "aarch64_sve_usubwt">; -def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; - -def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone>; -def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone>; - -def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLB_U_LANE : 
SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; + +def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; + +def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", 
"UiUl", MergeNone, "aarch64_sve_umlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Narrowing DSP operations let TargetGuard = "sve2" in { -def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVSHRNB : SInst<"svshrnb[_n_{d}]", 
"hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; - -def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", 
MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; + +def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Unary narrowing operations let TargetGuard = "sve2" in { -def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, 
"aarch64_sve_sqxtnb">; -def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb">; -def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb">; +def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [IsStreamingCompatible]>; +def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [IsStreamingCompatible]>; +def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [IsStreamingCompatible]>; -def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt">; -def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt">; -def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt">; +def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt", [IsStreamingCompatible]>; +def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt", [IsStreamingCompatible]>; +def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1726,28 +1822,28 @@ // SVE2 - Polynomial arithmetic let TargetGuard = "sve2" in { -def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul">; -def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone>; -def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">; -def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone>; -def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone>; -def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; -def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; +def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>; +def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>; +def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>; +def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", 
MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; +def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>; +def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; +def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Complex integer dot product let TargetGuard = "sve2" in { -def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [], [ImmCheck<4, ImmCheckComplexRotAll90>, +def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>, ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } @@ -1755,27 +1851,27 @@ // SVE2 - Floating-point widening multiply-accumulate let TargetGuard = "sve2" in { -def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb">; -def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt">; -def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb">; -def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt">; -def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>; +def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>; +def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", 
[IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>; +def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>; +def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Floating-point integer binary logarithm let TargetGuard = "sve2" in { -def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb">; -def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb">; -def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb">; +def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; +def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1797,32 +1893,32 @@ //////////////////////////////////////////////////////////////////////////////// // SVE2 - Contiguous conflict detection let TargetGuard = "sve2" in { -def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW]>; -def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW]>; -def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW]>; +def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, IsStreamingCompatible]>; -def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW]>; -def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; -def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW]>; -def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW]>; +def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", 
"sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, IsStreamingCompatible]>; } let TargetGuard = "sve2,bf16" in { -def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>; -def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>; +def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>; +def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Extended table lookup/permute let TargetGuard = "sve2" in { -def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone>; -def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx">; +def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } let TargetGuard = "sve2,bf16" in { -def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone>; -def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx">; +def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [IsStreamingCompatible]>; +def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1859,12 +1955,430 @@ def SVBGRP_N : SInst<"svbgrp[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x">; } -let TargetGuard = "sve2p1" in { -def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [], []>; -def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsOverloadNone], []>; +//////////////////////////////////////////////////////////////////////////////// +// SME + +let TargetGuard = "sve2p1|sme" in { +def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [IsStreamingCompatible], []>; +def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [IsStreamingCompatible], []>; +defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">; + +def SVPSEL_B : SInst<"svpsel_lane_b8", "PPPmi", "Pc", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_15>]>; +def SVPSEL_H : SInst<"svpsel_lane_b16", "PPPmi", "Ps", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_7>]>; +def SVPSEL_S : SInst<"svpsel_lane_b32", "PPPmi", "Pi", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_3>]>; +def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPmi", "Pl", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_1>]>; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVPSEL_COUNT_ALIAS_B : SInst<"svpsel_lane_c8", "}}Pmi", "Pc", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_15>]>; 
+def SVPSEL_COUNT_ALIAS_H : SInst<"svpsel_lane_c16", "}}Pmi", "Ps", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_7>]>; +def SVPSEL_COUNT_ALIAS_S : SInst<"svpsel_lane_c32", "}}Pmi", "Pi", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_3>]>; +def SVPSEL_COUNT_ALIAS_D : SInst<"svpsel_lane_c64", "}}Pmi", "Pl", MergeNone, "", [IsStreamingCompatible, IsZASliceBaseOffsetIntr], [ImmCheck<3, ImmCheck0_1>]>; +} + +let TargetGuard = "sme2" in { +def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], [], MemEltTyDefault>; +def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], [], MemEltTyDefault>; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_sve_ptrue_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], []>; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_3>]>; +def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [IsStreamingOrSVE2p1], [ImmCheck<1, ImmCheck0_1>]>; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<1, ImmCheck2_4_Mul2>]>; +} + +let TargetGuard = "sve2p1|sme2" in { + // Contiguous (load)/(non-temporal load) to multi-vector. + def SVLD1B_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1H_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1W_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1D_VG2 : MInst<"svld1[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1B_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1H_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1W_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1D_VG4 : MInst<"svld1[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + + def SVLDNT1B_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1H_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1W_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1D_VG2 : MInst<"svldnt1[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1B_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1H_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, 
IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1W_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1D_VG4 : MInst<"svldnt1[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + + def SVLD1B_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1H_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1W_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1D_VNUM_VG2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; + def SVLD1B_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1H_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1W_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + def SVLD1D_VNUM_VG4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; + + def SVLDNT1B_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1H_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1W_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1D_VNUM_VG2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; + def SVLDNT1B_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1H_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1W_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; + def SVLDNT1D_VNUM_VG4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVST1B_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1H_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1W_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1D_VG2 : MInst<"svst1[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1B_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1H_VG4 : 
MInst<"svst1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1W_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1D_VG4 : MInst<"svst1[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; + +def SVST1B_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1H_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1W_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1D_VNUM_VG2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; +def SVST1B_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1H_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1W_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +def SVST1D_VNUM_VG4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; +} + +let TargetGuard = "sve2p1|sme2" in { +def SVSTNT1B_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1H_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1W_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1D_VG2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1B_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1H_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1W_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1D_VG4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; + +def SVSTNT1B_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1H_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1W_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1D_VNUM_VG2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; +def SVSTNT1B_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingOrSVE2p1], 
MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1H_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1W_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +def SVSTNT1D_VNUM_VG4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingOrSVE2p1], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; + +def SVWHILEGE_COUNT : SInst<"svwhilege_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilege_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGT_COUNT : SInst<"svwhilegt_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilegt_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELE_COUNT : SInst<"svwhilele_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilele_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELT_COUNT : SInst<"svwhilelt_{d}", "}lli", "QcQsQiQl", MergeNone, "aarch64_sve_whilelt_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELO_COUNT : SInst<"svwhilelo_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilelo_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILELS_COUNT : SInst<"svwhilels_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHI_COUNT : SInst<"svwhilehi_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEHS_COUNT : SInst<"svwhilehs_{d}", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsStreamingOrSVE2p1, IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +def SVWHILEGE_X2 : SInst<"svwhilege_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege_x2", [IsStreamingCompatible], []>; +def SVWHILEGT_X2 : SInst<"svwhilegt_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt_x2", [IsStreamingCompatible], []>; +def SVWHILELE_X2 : SInst<"svwhilele_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele_x2", [IsStreamingCompatible], []>; +def SVWHILELT_X2 : SInst<"svwhilelt_{d}_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt_x2", [IsStreamingCompatible], []>; +def SVWHILELO_X2 : SInst<"svwhilelo_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilelo_x2", [IsStreamingCompatible], []>; +def SVWHILELS_X2 : SInst<"svwhilels_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilels_x2", [IsStreamingCompatible], []>; +def SVWHILEHI_X2 : SInst<"svwhilehi_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehi_x2", [IsStreamingCompatible], []>; +def SVWHILEHS_X2 : SInst<"svwhilehs_{d}_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehs_x2", [IsStreamingCompatible], []>; +} + +//////////////////////////////////////////////////////////////////////////////// +// SME2 + +let TargetGuard = "sme2" in { + // SRSHL / URSHL + def SVSRSHL_SINGLE_X2 : SInst<"svrshl[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_srshl_single_x2", [IsStreaming], []>; + def SVURSHL_SINGLE_X2 : SInst<"svrshl[_single_{d}_x2]", "22d", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_single_x2", [IsStreaming], []>; + def SVSRSHL_SINGLE_X4 : SInst<"svrshl[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_srshl_single_x4", [IsStreaming], []>; + def 
SVURSHL_SINGLE_X4 : SInst<"svrshl[_single_{d}_x4]", "44d", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_single_x4", [IsStreaming], []>; + + def SVSRSHL_X2 : SInst<"svrshl[_{d}_x2]", "222", "csil", MergeNone, "aarch64_sve_srshl_x2", [IsStreaming], []>; + def SVURSHL_X2 : SInst<"svrshl[_{d}_x2]", "222", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_x2", [IsStreaming], []>; + def SVSRSHL_X4 : SInst<"svrshl[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_srshl_x4", [IsStreaming], []>; + def SVURSHL_X4 : SInst<"svrshl[_{d}_x4]", "444", "UcUsUiUl", MergeNone, "aarch64_sve_urshl_x4", [IsStreaming], []>; + + // SQRSHR / UQRSHR + def SVQRSHR_X2 : SInst<"svqrshr[_{0}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshr_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVUQRSHR_X2 : SInst<"svqrshr[_{0}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshr_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVQRSHR_X4 : SInst<"svqrshr[_{0}_x4]", "q4i", "il", MergeNone, "aarch64_sve_sqrshr_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + def SVUQRSHR_X4 : SInst<"svqrshr[_{0}_x4]", "b4i", "UiUl", MergeNone, "aarch64_sve_uqrshr_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; +} + + // SQRSHRN / UQRSHRN +let TargetGuard = "sve2p1|sme2" in { + def SVQRSHRN_X2 : SInst<"svqrshrn[_{0}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; + def SVUQRSHRN_X2 : SInst<"svqrshrn[_{0}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; +} + +let TargetGuard = "sme2" in { + def SVQRSHRN_X4 : SInst<"svqrshrn[_{0}_x4]", "q4i", "il", MergeNone, "aarch64_sve_sqrshrn_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + def SVUQRSHRN_X4 : SInst<"svqrshrn[_{0}_x4]", "b4i", "UiUl", MergeNone, "aarch64_sve_uqrshrn_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + // SQRSHRU + def SVSQRSHRU_X2 : SInst<"svsqrshru[_{0}_x2]", "e2i", "i", MergeNone, "aarch64_sve_sqrshru_x2", [IsStreaming], [ImmCheck<1, ImmCheck1_16>]>; + def SVSQRSHRU_X4 : SInst<"svsqrshru[_{0}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshru_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; +} + +let TargetGuard = "sve2p1|sme2" in { + // SQRSHRUN + def SVSQRSHRUN_X2 : SInst<"svsqrshrun[_{0}_x2]", "e2i", "i", MergeNone, "aarch64_sve_sqrshrun_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; +} + +let TargetGuard = "sve2p1|sme2" in { + def SVSQRSHRUN_X4 : SInst<"svsqrshrun[_{0}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; + + def SVZIP_X2 : SInst<"svzip[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x2", [IsStreamingOrSVE2p1], []>; + def SVZIPQ_X2 : SInst<"svzipq[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq_x2", [IsStreamingOrSVE2p1], []>; + def SVZIP_X4 : SInst<"svzip[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zip_x4", [IsStreamingOrSVE2p1], []>; + def SVZIPQ_X4 : SInst<"svzipq[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq_x4", [IsStreamingOrSVE2p1], []>; + + def SVUZP_X2 : SInst<"svuzp[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzp_x2", [IsStreamingOrSVE2p1], []>; + def SVUZPQ_X2 : SInst<"svuzpq[_{d}]_x2", "2dd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq_x2", [IsStreamingOrSVE2p1], []>; + def SVUZP_X4 : SInst<"svuzp[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzp_x4", [IsStreamingOrSVE2p1], []>; 
+ def SVUZPQ_X4 : SInst<"svuzpq[_{d}]_x4", "4dddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq_x4", [IsStreamingOrSVE2p1], []>; + + // SQDMULH + def SVSQDMULH_SINGLE_X2 : SInst<"svsqdmulh[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx2", [IsStreamingOrSVE2p1], []>; + def SVSQDMULH_SINGLE_X4 : SInst<"svsqdmulh[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx4", [IsStreamingOrSVE2p1], []>; + def SVSQDMULH_X2 : SInst<"svsqdmulh[_{d}_x2]", "222", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx2", [IsStreamingOrSVE2p1], []>; + def SVSQDMULH_X4 : SInst<"svsqdmulh[_{d}_x4]", "444", "csil", MergeNone, "aarch64_sve_sqdmulh_vgx4", [IsStreamingOrSVE2p1], []>; +} + +let TargetGuard = "sve2p1|sme2" in { + // 2-way dot products + def SVDOT_X2_S : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "i", MergeNone, "aarch64_sve_sdot_x2", [IsStreamingCompatible], []>; + def SVDOT_X2_U : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "Ui", MergeNone, "aarch64_sve_udot_x2", [IsStreamingCompatible], []>; + def SVDOT_X2_F : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "f", MergeNone, "aarch64_sve_fdot_x2", [IsStreamingCompatible], []>; + def SVDOT_LANE_X2_S : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "i", MergeNone, "aarch64_sve_sdot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "Ui", MergeNone, "aarch64_sve_udot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "f", MergeNone, "aarch64_sve_fdot_lane_x2", [IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2" in { + // 2-vector and 4-vector signed and unsigned unpacks + def SVSUNPK_X2 : SInst<"svunpk[_{d}_x2]", "2h", "sil", MergeNone, "aarch64_sve_sunpk_x2", [IsStreaming], []>; + def SVUUNPK_X2 : SInst<"svunpk[_{d}_x2]", "2h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x2", [IsStreaming], []>; + def SVSUNPK_X4 : SInst<"svunpk[_{d}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>; + def SVUUNPK_X4 : SInst<"svunpk[_{d}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; +} +let TargetGuard = "sve2p1|sme2" in { + // CLAMPS + def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [IsStreamingCompatible], []>; +} + +let TargetGuard = "sme2" in { + def SVSCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x2", [IsStreaming], []>; + def SVUCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x2", [IsStreaming], []>; + def SVFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x2", [IsStreaming], []>; + + def SVSCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x4", [IsStreaming], []>; + def SVUCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x4", [IsStreaming], []>; + def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>; + + // 2-way and 4-way selects + def SVSEL_X2 : SInst<"svsel[_{d}_x2]", "2}22", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_sel_x2", [IsStreaming], []>; + def SVSEL_X4 : SInst<"svsel[_{d}_x4]", "4}44", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_sel_x4", [IsStreaming], []>; +} + +let TargetGuard = "sve2p1,b16b16" in { + def SVFCLAMP_BF : 
SInst<"svclamp[_{d}]", "dddd", "b", MergeNone, "aarch64_sve_fclamp", [IsStreamingCompatible], []>; +} + +multiclass MinMaxIntr { + def SVS # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "csil", MergeNone, "aarch64_sve_s" # i # zm # "_" # mul, [IsStreaming], []>; + def SVU # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "UcUsUiUl", MergeNone, "aarch64_sve_u" # i # zm # "_" # mul, [IsStreaming], []>; + def SVF # NAME : SInst<"sv" # i # "[" # zm # "_{d}_" # mul # "]", t, "hfd", MergeNone, "aarch64_sve_f" # i # zm # "_" # mul, [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { + // SMAX / UMAX / FMAX + defm MAX_SINGLE_X2 : MinMaxIntr<"max", "_single", "x2", "22d">; + defm MAX_MULTI_X2 : MinMaxIntr<"max", "", "x2", "222">; + defm MAX_SINGLE_X4 : MinMaxIntr<"max", "_single", "x4", "44d">; + defm MAX_MULTI_X4 : MinMaxIntr<"max", "", "x4", "444">; + + // SMIN / UMIN / FMIN + defm MIN_SINGLE_X2 : MinMaxIntr<"min", "_single", "x2", "22d">; + defm MIN_MULTI_X2 : MinMaxIntr<"min", "", "x2", "222">; + defm MIN_SINGLE_X4 : MinMaxIntr<"min", "_single", "x4", "44d">; + defm MIN_MULTI_X4 : MinMaxIntr<"min", "", "x4", "444">; +} + +let TargetGuard = "sme2" in { + // FMINNM / FMAXNM + def SVMAXNM_SINGLE_X2 : SInst<"svmaxnm[_single_{d}_x2]", "22d", "hfd", MergeNone, "aarch64_sve_fmaxnm_single_x2", [IsStreaming], []>; + def SVMAXNM_SINGLE_X4 : SInst<"svmaxnm[_single_{d}_x4]", "44d", "hfd", MergeNone, "aarch64_sve_fmaxnm_single_x4", [IsStreaming], []>; + + def SVMAXNM_X2 : SInst<"svmaxnm[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fmaxnm_x2", [IsStreaming], []>; + def SVMAXNM_X4 : SInst<"svmaxnm[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fmaxnm_x4", [IsStreaming], []>; + + def SVMINNM_SINGLE_X2 : SInst<"svminnm[_single_{d}_x2]", "22d", "hfd", MergeNone, "aarch64_sve_fminnm_single_x2", [IsStreaming], []>; + def SVMINNM_SINGLE_X4 : SInst<"svminnm[_single_{d}_x4]", "44d", "hfd", MergeNone, "aarch64_sve_fminnm_single_x4", [IsStreaming], []>; + + def SVMINNM_X2 : SInst<"svminnm[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fminnm_x2", [IsStreaming], []>; + def SVMINNM_X4 : SInst<"svminnm[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fminnm_x4", [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { + // FRINTA / FRINTM / FRINTN / FRINTP + def SVRINTA_X2 : SInst<"svrinta[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frinta_x2", [IsStreaming], []>; + def SVRINTA_X4 : SInst<"svrinta[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frinta_x4", [IsStreaming], []>; + + def SVRINTM_X2 : SInst<"svrintm[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintm_x2", [IsStreaming], []>; + def SVRINTM_X4 : SInst<"svrintm[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintm_x4", [IsStreaming], []>; + + def SVRINTN_X2 : SInst<"svrintn[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintn_x2", [IsStreaming], []>; + def SVRINTN_X4 : SInst<"svrintn[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintn_x4", [IsStreaming], []>; + + def SVRINTP_X2 : SInst<"svrintp[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintp_x2", [IsStreaming], []>; + def SVRINTP_X4 : SInst<"svrintp[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintp_x4", [IsStreaming], []>; +} + +let TargetGuard = "sme2" in { +// == ADD (vectors) == + def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>; + def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>; +} + +// +// 
Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 +// +let TargetGuard = "sme2" in { + def SVCVTN_F16_X2 : SInst<"svcvtn_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvtn_x2", [IsStreaming],[]>; + def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>; +} + +// +// Multi-vector convert to/from floating-point. +// +let TargetGuard = "sme2" in { + def SVCVT_F16_X2 : SInst<"svcvt_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvt_x2", [IsStreaming],[]>; + def SVCVT_BF16_X2 : SInst<"svcvt_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvt_x2", [IsOverloadNone, IsStreaming],[]>; +} + +// +// Multi-vector convert to/from floating-point. +// +let TargetGuard = "sme2" in { + def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_fcvtu_x2", [IsStreaming], []>; + def SVCVT_U32_F32_X2 : SInst<"svcvt_u32[_{d}_x2]", "2.u2.d", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming], []>; + def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_fcvts_x2", [IsStreaming], []>; + def SVCVT_S32_F32_X2 : SInst<"svcvt_s32[_{d}_x2]", "2.x2.d", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming], []>; + + def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_fcvtu_x4", [IsStreaming], []>; + def SVCVT_U32_F32_X4 : SInst<"svcvt_u32[_{d}_x4]", "4.u4.d", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming], []>; + def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_fcvts_x4", [IsStreaming], []>; + def SVCVT_S32_F32_X4 : SInst<"svcvt_s32[_{d}_x4]", "4.x4.d", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming], []>; +} + +// BFloat16 floating-point multiply-subtract long from single-precision. 
+ +let TargetGuard = "sve2p1|sme2" in { + def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone, IsStreamingCompatible], []>; + def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone, IsStreamingCompatible], []>; + + def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; +} + +// +// Multi-vector saturating extract narrow +// +let TargetGuard = "sme2" in { + def SVQCVT_S16_S32_X2 : SInst<"svqcvt_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvt_x2", [IsStreaming], []>; + def SVQCVT_U16_U32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvt_x2", [IsStreaming], []>; + def SVQCVT_U16_S32_X2 : SInst<"svqcvt_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtu_x2", [IsStreaming], []>; + + def SVQCVT_S8_S32_X4 : SInst<"svqcvt_s8[_{d}_x4]", "q4.d", "i", MergeNone, "aarch64_sve_sqcvt_x4", [IsStreaming], []>; + def SVQCVT_U8_U32_X4 : SInst<"svqcvt_u8[_{d}_x4]", "b4.d", "Ui", MergeNone, "aarch64_sve_uqcvt_x4", [IsStreaming], []>; + def SVQCVT_U8_S32_X4 : SInst<"svqcvt_u8[_{d}_x4]", "b4.d", "i", MergeNone, "aarch64_sve_sqcvtu_x4", [IsStreaming], []>; + + def SVQCVT_S16_S64_X4 : SInst<"svqcvt_s16[_{d}_x4]", "q4.d", "l", MergeNone, "aarch64_sve_sqcvt_x4", [IsStreaming], []>; + def SVQCVT_U16_U64_X4 : SInst<"svqcvt_u16[_{d}_x4]", "b4.d", "Ul", MergeNone, "aarch64_sve_uqcvt_x4", [IsStreaming], []>; + def SVQCVT_U16_S64_X4 : SInst<"svqcvt_u16[_{d}_x4]", "b4.d", "l", MergeNone, "aarch64_sve_sqcvtu_x4", [IsStreaming], []>; +} + +// +// Multi-vector saturating extract narrow and interleave +// +let TargetGuard = "sve2p1|sme2" in { + def SVQCVTN_S16_S32_X2 : SInst<"svqcvtn_s16[_{d}_x2]", "h2.d", "i", MergeNone, "aarch64_sve_sqcvtn_x2", [IsStreamingCompatible], []>; + def SVQCVTN_U16_U32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x2", [IsStreamingCompatible], []>; + def SVQCVTN_U16_S32_X2 : SInst<"svqcvtn_u16[_{d}_x2]", "e2.d", "i", MergeNone, "aarch64_sve_sqcvtun_x2", [IsStreamingCompatible], []>; +} + +let TargetGuard = "sme2" in { + def SVQCVTN_S8_S32_X4 : SInst<"svqcvtn_s8[_{d}_x4]", "q4.d", "i", MergeNone, "aarch64_sve_sqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U8_U32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "Ui", MergeNone, "aarch64_sve_uqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U8_S32_X4 : SInst<"svqcvtn_u8[_{d}_x4]", "b4.d", "i", MergeNone, "aarch64_sve_sqcvtun_x4", [IsStreaming], []>; + + def SVQCVTN_S16_S64_X4 : SInst<"svqcvtn_s16[_{d}_x4]", "q4.d", "l", MergeNone, "aarch64_sve_sqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U16_U64_X4 : SInst<"svqcvtn_u16[_{d}_x4]", "b4.d", "Ul", MergeNone, "aarch64_sve_uqcvtn_x4", [IsStreaming], []>; + def SVQCVTN_U16_S64_X4 : SInst<"svqcvtn_u16[_{d}_x4]", "b4.d", "l", MergeNone, "aarch64_sve_sqcvtun_x4", [IsStreaming], []>; } let TargetGuard = "sve2p1" in { -def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>; -def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>; + // ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 + def SVZIPQ1 : SInst<"svzipq1[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq1", [], []>; + def 
SVZIPQ2 : SInst<"svzipq2[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_zipq2", [], []>; + def SVUZPQ1 : SInst<"svuzpq1[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq1", [], []>; + def SVUZPQ2 : SInst<"svuzpq2[_{d}]", "ddd", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_uzpq2", [], []>; + // TBLQ, TBXQ + def SVTBLQ : SInst<"svtblq[_{d}]", "ddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tblq">; + def SVTBXQ : SInst<"svtbxq[_{d}]", "dddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tbxq">; + // EXTQ + def EXTQ : SInst<"svextq_lane[_{d}]", "dddk", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_extq_lane", [], [ImmCheck<2, ImmCheck0_15>]>; + + // Load N-element structure into N vectors (scalar base) + defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">; + defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">; + defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">; + + // Load N-element structure into N vectors (scalar base, VL displacement) + defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">; + defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">; + defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">; + + // Store N vectors into N-element structure (scalar base) + defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">; + defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">; + defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">; + + // Store N vectors into N-element structure (scalar base, VL displacement) + defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">; + defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">; + defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">; + + // PMOV + // Move to Pred + def SVPMOV_B_TO_PRED_LANE : SInst<"svpmov_lane[_{d}]", "Pdk", "cUc", MergeNone, "aarch64_sve_pmov_to_pred_lane", [], [ImmCheck<1, ImmCheck0_0>]>; + def SVPMOV_H_TO_PRED_LANE : SInst<"svpmov_lane[_{d}]", "Pdk", "sUs", MergeNone, "aarch64_sve_pmov_to_pred_lane", [], [ImmCheck<1, ImmCheck0_1>]>; + def SVPMOV_S_TO_PRED_LANE : SInst<"svpmov_lane[_{d}]", "Pdk", "iUi", MergeNone, "aarch64_sve_pmov_to_pred_lane", [], [ImmCheck<1, ImmCheck0_3>]>; + def SVPMOV_D_TO_PRED_LANE : SInst<"svpmov_lane[_{d}]", "Pdk", "lUl", MergeNone, "aarch64_sve_pmov_to_pred_lane", [], [ImmCheck<1, ImmCheck0_7>]>; + + // Move to Vector + multiclass PMOV_TO_VEC flags=[], ImmCheckType immCh > { + def _M : SInst]>; + def _Z : SInst; + } + def SVPMOV_TO_VEC_LANE_B : SInst<"svpmov_lane_{d}_z", "dP", "cUc", MergeNone, "aarch64_sve_pmov_to_vector_lane_zeroing", [], []>; + defm SVPMOV_TO_VEC_LANE_H : PMOV_TO_VEC<"svpmov_lane", "sUs", "aarch64_sve_pmov_to_vector_lane", [], ImmCheck1_1>; + defm SVPMOV_TO_VEC_LANE_S : PMOV_TO_VEC<"svpmov_lane", "iUi", "aarch64_sve_pmov_to_vector_lane", [], ImmCheck1_3>; + defm SVPMOV_TO_VEC_LANE_D : PMOV_TO_VEC<"svpmov_lane", "lUl", "aarch64_sve_pmov_to_vector_lane" ,[], ImmCheck1_7>; + + // Load one vector (vector base + scalar offset) + def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">; + def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, 
"aarch64_sve_ld1q_gather_scalar_offset">; + + // Store one vector (vector base + scalar offset) + def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">; + def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">; } diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -61,7 +61,8 @@ // ------------------- // prototype: return (arg, arg, ...) // -// 2,3,4: array of default vectors +// 2,3,4: array of vectors +// .: indicator for multi-vector modifier that will follow(eg.:2.x) // v: void // x: vector of signed integers // u: vector of unsigned integers @@ -91,6 +92,7 @@ // l: int64_t // m: uint32_t // n: uint64_t +// y: bool // t: svint32_t // z: svuint32_t @@ -98,10 +100,10 @@ // O: svfloat16_t // M: svfloat32_t // N: svfloat64_t +// $: svbfloat16_t // J: Prefetch type (sv_prfop) - -// %: pointer to void +// p: pointer to element type // A: pointer to int8_t // B: pointer to int16_t @@ -125,7 +127,11 @@ // Y: const pointer to uint32_t // Z: const pointer to uint64_t +// Prototype modifiers added for SME +// %: pointer to void (upstream) +// // Prototype modifiers added for SVE2p1 +// {: 128b vector // }: svcount_t class MergeType { @@ -147,15 +153,15 @@ def EltTyInt16 : EltType<2>; def EltTyInt32 : EltType<3>; def EltTyInt64 : EltType<4>; -def EltTyInt128 : EltType<5>; -def EltTyFloat16 : EltType<6>; -def EltTyFloat32 : EltType<7>; -def EltTyFloat64 : EltType<8>; -def EltTyBool8 : EltType<9>; -def EltTyBool16 : EltType<10>; -def EltTyBool32 : EltType<11>; -def EltTyBool64 : EltType<12>; -def EltTyBFloat16 : EltType<13>; +def EltTyFloat16 : EltType<5>; +def EltTyFloat32 : EltType<6>; +def EltTyFloat64 : EltType<7>; +def EltTyBool8 : EltType<8>; +def EltTyBool16 : EltType<9>; +def EltTyBool32 : EltType<10>; +def EltTyBool64 : EltType<11>; +def EltTyBFloat16 : EltType<12>; +def EltTyInt128 : EltType<13>; // FIXME: is this necessary? class MemEltType { int Value = val; @@ -165,6 +171,7 @@ def MemEltTyInt16 : MemEltType<2>; def MemEltTyInt32 : MemEltType<3>; def MemEltTyInt64 : MemEltType<4>; +def MemEltTyInt128 : MemEltType<5>; class FlagType { int Value = val; @@ -222,6 +229,10 @@ def IsPreservesZA : FlagType<0x10000000000>; def IsReadZA : FlagType<0x20000000000>; def IsWriteZA : FlagType<0x40000000000>; +def IsZASliceBaseOffsetIntr : FlagType<0x80000000000>; +def IsTileBaseOffsetIntr : FlagType<0x100000000000>; +def IsStreamingOrSVE2p1 : FlagType<0x200000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. 
+def IsReductionQV : FlagType<0x400000000000>; // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { @@ -246,36 +257,19 @@ def ImmCheck0_0 : ImmCheckType<16>; // 0..0 def ImmCheck0_15 : ImmCheckType<17>; // 0..15 def ImmCheck0_255 : ImmCheckType<18>; // 0..255 +def ImmCheck0_2_Mul2 : ImmCheckType<19>; // 0, 2 +def ImmCheck0_6_Mul2 : ImmCheckType<20>; // 0, 2, 4, 6 +def ImmCheck0_14_Mul2 : ImmCheckType<21>; // 0, 2, .., 14 +def ImmCheck0_4_Mul4 : ImmCheckType<22>; // 0, 4 +def ImmCheck0_12_Mul4 : ImmCheckType<23>; // 0, 4, 8, 12 +def ImmCheck2_4_Mul2 : ImmCheckType<24>; // 2, 4 +def ImmCheck0_56_Mul8 : ImmCheckType<25>; // 0, 8, .., 56 +def ImmCheck1_1 : ImmCheckType<26>; // 1..1 +def ImmCheck1_3 : ImmCheckType<27>; // 1..3 +def ImmCheck1_7 : ImmCheckType<28>; // 1..7 class ImmCheck { int Arg = arg; int EltSizeArg = eltSizeArg; ImmCheckType Kind = kind; } - -class Inst ft, list ch, MemEltType met> { - string Name = n; - string Prototype = p; - string Types = t; - string TargetGuard = "sve"; - int Merge = mt.Value; - string MergeSuffix = mt.Suffix; - string LLVMIntrinsic = i; - list Flags = ft; - list ImmChecks = ch; - int MemEltType = met.Value; -} - -// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8") -class SInst ft = [], list ch = []> - : Inst { -} - -// MInst: Instructions which access memory -class MInst f, - MemEltType met = MemEltTyDefault, string i = "", - list ch = []> - : Inst { -} diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13641,7 +13641,10 @@ bool CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool ParseSVEImmChecks(CallExpr *TheCall, + SmallVector, 3> &ImmChecks); bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg, diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -50,6 +50,7 @@ bool HasMatMul = false; bool HasBFloat16 = false; bool HasSVE2 = false; + bool HasSVE2p1 = false; bool HasSVE2AES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; @@ -70,6 +71,7 @@ bool HasSME = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; + bool HasSME2 = false; bool HasSB = false; bool HasPredRes = false; bool HasSSBS = false; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -403,6 +403,18 @@ if (HasSVE2 && HasSVE2AES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); + if (HasSME) + Builder.defineMacro("__ARM_FEATURE_SME", "1"); + + if (HasSMEI16I64) + Builder.defineMacro("__ARM_FEATURE_SME_I16I64", "1"); + + if (HasSMEF64F64) + Builder.defineMacro("__ARM_FEATURE_SME_F64F64", "1"); + + if (HasSME2) + Builder.defineMacro("__ARM_FEATURE_SME2", "1"); + if (HasSVE2 && HasSVE2BitPerm) Builder.defineMacro("__ARM_FEATURE_SVE2_BITPERM", "1"); @@ -477,9 +489,8 @@ Builder.defineMacro("__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", "1"); } - if ((FPU & SveMode) && HasBFloat16) { + if ((FPU & 
SveMode) && HasBFloat16) Builder.defineMacro("__ARM_FEATURE_SVE_BF16", "1"); - } if ((FPU & SveMode) && HasMatmulFP64) Builder.defineMacro("__ARM_FEATURE_SVE_MATMUL_FP64", "1"); @@ -575,7 +586,7 @@ } ArrayRef AArch64TargetInfo::getTargetBuiltins() const { - return llvm::ArrayRef(BuiltinInfo, clang::AArch64::LastTSBuiltin - + return llvm::makeArrayRef(BuiltinInfo, clang::AArch64::LastTSBuiltin - Builtin::FirstTSBuiltin); } @@ -659,11 +670,13 @@ .Case("f32mm", FPU & SveMode && HasMatmulFP32) .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) + .Case("sve2p1", HasSVE2p1) .Case("sve2-pmull128", FPU & SveMode && HasSVE2AES) .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm) .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sme", HasSME) + .Case("sme2", HasSME2) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Cases("memtag", "memtag2", HasMTE) @@ -738,6 +751,13 @@ HasFullFP16 = true; HasSVE2 = true; } + if (Feature == "+sve2p1") { + FPU |= NeonMode; + FPU |= SveMode; + HasFullFP16 = true; + HasSVE2 = true; + HasSVE2p1 = true; + } if (Feature == "+sve2-aes") { FPU |= NeonMode; FPU |= SveMode; @@ -779,22 +799,32 @@ HasMatmulFP64 = true; } if (Feature == "+sme") { + FPU |= SveMode; HasSME = true; HasBFloat16 = true; HasFullFP16 = true; } if (Feature == "+sme-f64f64") { + FPU |= SveMode; HasSME = true; HasSMEF64F64 = true; HasBFloat16 = true; HasFullFP16 = true; } if (Feature == "+sme-i16i64") { + FPU |= SveMode; HasSME = true; HasSMEI16I64 = true; HasBFloat16 = true; HasFullFP16 = true; } + if (Feature == "+sme2") { + FPU |= SveMode; + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + } if (Feature == "+sb") HasSB = true; if (Feature == "+predres") @@ -1172,7 +1202,7 @@ }; ArrayRef AArch64TargetInfo::getGCCRegNames() const { - return llvm::ArrayRef(GCCRegNames); + return llvm::makeArrayRef(GCCRegNames); } const TargetInfo::GCCRegAlias AArch64TargetInfo::GCCRegAliases[] = { @@ -1215,7 +1245,7 @@ }; ArrayRef AArch64TargetInfo::getGCCRegAliases() const { - return llvm::ArrayRef(GCCRegAliases); + return llvm::makeArrayRef(GCCRegAliases); } // Returns the length of cc constraint. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -8965,6 +8965,8 @@ return Builder.getInt32Ty(); case SVETypeFlags::MemEltTyInt64: return Builder.getInt64Ty(); + case SVETypeFlags::MemEltTyInt128: + return Builder.getInt128Ty(); } llvm_unreachable("Unknown MemEltType"); } @@ -9093,6 +9095,10 @@ // the elements of the specified datatype. 
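+// A value of the new 'aarch64.svcount' target extension type is returned
+// unchanged below; ordinary svbool_t predicates are still narrowed to the
+// intrinsic's element width (e.g. the convert.from.svbool.nxv4i1 calls seen
+// in the acle_sme_add.c CHECK lines further down).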
Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred, llvm::ScalableVectorType *VTy) { + if (isa(Pred->getType()) && + cast(Pred->getType())->getName() == "aarch64.svcount") + return Pred; + auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy); if (Pred->getType() == RTy) return Pred; @@ -9267,12 +9273,19 @@ unsigned N; switch (IntID) { case Intrinsic::aarch64_sve_ld2_sret: + case Intrinsic::aarch64_sve_ld1_pn_x2: + case Intrinsic::aarch64_sve_ldnt1_pn_x2: + case Intrinsic::aarch64_sve_ld2q_sret: N = 2; break; case Intrinsic::aarch64_sve_ld3_sret: + case Intrinsic::aarch64_sve_ld3q_sret: N = 3; break; case Intrinsic::aarch64_sve_ld4_sret: + case Intrinsic::aarch64_sve_ld1_pn_x4: + case Intrinsic::aarch64_sve_ldnt1_pn_x4: + case Intrinsic::aarch64_sve_ld4q_sret: N = 4; break; default: @@ -9308,12 +9321,19 @@ unsigned N; switch (IntID) { case Intrinsic::aarch64_sve_st2: + case Intrinsic::aarch64_sve_st1_pn_x2: + case Intrinsic::aarch64_sve_stnt1_pn_x2: + case Intrinsic::aarch64_sve_st2q: N = 2; break; case Intrinsic::aarch64_sve_st3: + case Intrinsic::aarch64_sve_st3q: N = 3; break; case Intrinsic::aarch64_sve_st4: + case Intrinsic::aarch64_sve_st1_pn_x4: + case Intrinsic::aarch64_sve_stnt1_pn_x4: + case Intrinsic::aarch64_sve_st4q: N = 4; break; default: @@ -9323,8 +9343,7 @@ Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy); Value *BasePtr = Ops[1]; - // Does the store have an offset? - if (Ops.size() > 3) + if (Ops.size() > (2 + N)) BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]); Value *Val = Ops.back(); @@ -9332,15 +9351,13 @@ // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we // need to break up the tuple vector. SmallVector Operands; - unsigned MinElts = VTy->getElementCount().getKnownMinValue(); - for (unsigned I = 0; I < N; ++I) { - Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); - Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx)); - } + for (unsigned I = Ops.size() - N; I < Ops.size(); ++I) + Operands.push_back(Ops[I]); Operands.append({Predicate, BasePtr}); - Function *F = CGM.getIntrinsic(IntID, { VTy }); + return Builder.CreateCall(F, Operands); + } // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and @@ -9394,7 +9411,7 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E, llvm::Type *ReturnTy, SmallVectorImpl &Ops, - unsigned BuiltinID, + unsigned IntrinsicID, bool IsZExtReturn) { QualType LangPTy = E->getArg(1)->getType(); llvm::Type *MemEltTy = CGM.getTypes().ConvertType( @@ -9403,28 +9420,47 @@ // The vector type that is returned may be different from the // eventual type loaded from memory. auto VectorTy = cast(ReturnTy); - auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy); + llvm::ScalableVectorType *MemoryTy = nullptr; + llvm::ScalableVectorType *PredTy = nullptr; + bool IsExtendingLoad = true; + switch (IntrinsicID) { + case Intrinsic::aarch64_sve_ld1uwq: + case Intrinsic::aarch64_sve_ld1udq: + MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1); + PredTy = + llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1); + IsExtendingLoad = false; + break; + default: + MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy); + PredTy = MemoryTy; + break; + } - Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy); + Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy); Value *BasePtr = Ops[1]; // Does the load have an offset? 
if (Ops.size() > 2) BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]); - Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy); + Function *F = + CGM.getIntrinsic(IntrinsicID, IsExtendingLoad ? MemoryTy : VectorTy); auto *Load = cast(Builder.CreateCall(F, {Predicate, BasePtr})); auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType()); CGM.DecorateInstructionWithTBAA(Load, TBAAInfo); + if (!IsExtendingLoad) + return Load; + return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy) - : Builder.CreateSExt(Load, VectorTy); + : Builder.CreateSExt(Load, VectorTy); } Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, SmallVectorImpl &Ops, - unsigned BuiltinID) { + unsigned IntrinsicID) { QualType LangPTy = E->getArg(1)->getType(); llvm::Type *MemEltTy = CGM.getTypes().ConvertType( LangPTy->castAs()->getPointeeType()); @@ -9434,17 +9470,34 @@ auto VectorTy = cast(Ops.back()->getType()); auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy); - Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy); + auto PredTy = MemoryTy; + auto AddrMemoryTy = MemoryTy; + bool IsTruncatingStore = true; + ; + switch (IntrinsicID) { + case Intrinsic::aarch64_sve_st1uwq: + case Intrinsic::aarch64_sve_st1udq: + AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1); + PredTy = + llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1); + IsTruncatingStore = false; + break; + default: + break; + } + Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy); Value *BasePtr = Ops[1]; // Does the store have an offset? if (Ops.size() == 4) - BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]); + BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]); // Last value is always the data - llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy); + Value *Val = IsTruncatingStore ? Builder.CreateTrunc(Ops.back(), MemoryTy) + : Ops.back(); - Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy); + Function *F = + CGM.getIntrinsic(IntrinsicID, IsTruncatingStore ? MemoryTy : VectorTy); auto *Store = cast(Builder.CreateCall(F, {Val, Predicate, BasePtr})); auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType()); @@ -9459,32 +9512,47 @@ Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags, SmallVectorImpl &Ops, - unsigned IntID) { - Ops[3] = EmitSVEPredicateCast( - Ops[3], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags))); - - SmallVector NewOps; - NewOps.push_back(Ops[3]); - - llvm::Value *BasePtr = Ops[4]; - - // If the intrinsic contains the vnum parameter, multiply it with the vector - // size in bytes. - if (Ops.size() == 6) { - Function *StreamingVectorLength = - CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); - llvm::Value *StreamingVectorLengthCall = - Builder.CreateCall(StreamingVectorLength); - llvm::Value *Mulvl = - Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl"); - // The type of the ptr parameter is void *, so use Int8Ty here. 
- BasePtr = Builder.CreateGEP(Int8Ty, Ops[4], Mulvl); - } - NewOps.push_back(BasePtr); - NewOps.push_back(Ops[0]); - NewOps.push_back(EmitTileslice(Ops[2], Ops[1])); - Function *F = CGM.getIntrinsic(IntID); - return Builder.CreateCall(F, NewOps); + unsigned BuiltinID) { + Function *F = CGM.getIntrinsic(BuiltinID); + Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); + + // We support 3 classes of builtins here: + // * 3-op forms: + // svldr_vnum_za(slice_base, slice_offset, ptr) + // svstr_vnum_za(slice_base, slice_offset, ptr) + if (Ops.size() == 3) { + // For the 3-op form, slice_offset == vnum. + Value *VNum = Ops[1]; + llvm::Value *SVL = Builder.CreateCall(Cntsb); + Value *Offset = + Builder.CreateMul(Builder.CreateZExtOrTrunc(VNum, SVL->getType()), SVL); + Value *Ptr = Builder.CreateGEP(Builder.getInt8Ty(), /*Ptr*/ Ops[2], Offset); + Value *Slice = EmitTileslice(VNum, /*Base*/Ops[0]); + return Builder.CreateCall(F, {Slice, Ptr}); + } + + Value *Ptr = Ops[4]; + Value *Slice = EmitTileslice(Ops[2], Ops[1]); + llvm::ScalableVectorType *OverloadedTy = + getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)); + Value *Pn = EmitSVEPredicateCast(/*Pn*/ Ops[3], OverloadedTy); + + // * 5-op forms: + // svld1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr) + // svst1_(ver|hor)_zaX(tile, slice_base, slice_offset, pn, ptr) + if (Ops.size() == 5) + return Builder.CreateCall(F, {Pn, Ptr, /*Tile*/ Ops[0], Slice}); + + // + // * 6-op forms: + // svld1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum) + // svst1_(ver|hor)_vnum_zaX(tile, slice_base, slice_offset, pn, ptr, vnum) + assert (Ops.size() == 6 && "Unexpected number of ops"); + Value *SVL = Builder.CreateCall(Cntsb); + Value *Offset = Builder.CreateMul( + Builder.CreateZExtOrTrunc(/*VNum*/ Ops[5], SVL->getType()), SVL); + Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Ptr, Offset); + return Builder.CreateCall(F, {Pn, Ptr, /*Tile*/ Ops[0], Slice}); } Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags, @@ -9581,10 +9649,47 @@ if (TypeFlags.isOverloadCvt()) return {Ops[0]->getType(), Ops.back()->getType()}; + if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() && + ResultType->isVectorTy()) + return { ResultType, Ops[1]->getType() }; + assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads"); return {DefaultType}; } +Value *CodeGenFunction::FormMultiVectorResult(Value *Call) { + // Multi-vector results should be broken up into a single (wide) result + // vector. + if (auto *StructTy = dyn_cast(Call->getType())) { + if (auto *VTy = + dyn_cast(StructTy->getTypeAtIndex(0U))) { + unsigned N = StructTy->getNumElements(); + + // We may need to emit a cast to a svbool_t + bool IsPredTy = VTy->getElementType()->isIntegerTy(1); + unsigned MinElts = IsPredTy ? 
16 : VTy->getMinNumElements(); + + ScalableVectorType *WideVTy = + ScalableVectorType::get(VTy->getElementType(), MinElts * N); + Value *Ret = llvm::PoisonValue::get(WideVTy); + for (unsigned I = 0; I < N; ++I) { + Value *SRet = Builder.CreateExtractValue(Call, I); + assert(SRet->getType() == VTy && "Unexpected type for result value"); + Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts); + + if (IsPredTy) + SRet = EmitSVEPredicateCast( + SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16)); + + Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx); + } + Call = Ret; + } + } + + return Call; +} + Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags, llvm::Type *Ty, ArrayRef Ops) { @@ -9618,24 +9723,34 @@ return Call; } -Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, - const CallExpr *E) { +void CodeGenFunction::GetAArch64SMEProcessedOperands( + unsigned BuiltinID, const CallExpr *E, SmallVectorImpl &Ops, + SVETypeFlags TypeFlags) { // Find out if any arguments are required to be integer constant expressions. unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); - llvm::Type *Ty = ConvertType(E->getType()); - if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && - BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { - Value *Val = EmitScalarExpr(E->getArg(0)); - return EmitSVEReinterpret(Val, Ty); - } + bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet(); - llvm::SmallVector Ops; for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { - if ((ICEArguments & (1 << i)) == 0) + if (!IsTupleGetOrSet && (ICEArguments & (1 << i)) == 0) { + Value *Arg = EmitScalarExpr(E->getArg(i)); + if (auto *VTy = dyn_cast(Arg->getType())) { + unsigned MinElts = VTy->getMinNumElements(); + bool IsPred = VTy->getElementType()->isIntegerTy(1); + unsigned N = + (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 
16 : 128); + for (unsigned I = 0; I < N; ++I) { + Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N); + auto *NewVTy = + ScalableVectorType::get(VTy->getElementType(), MinElts / N); + Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx)); + } + } else + Ops.push_back(Arg); + } else if ((ICEArguments & (1 << i)) == 0) Ops.push_back(EmitScalarExpr(E->getArg(i))); else { // If this is required to be a constant, constant fold it so that we know @@ -9652,9 +9767,25 @@ } } + return; +} + +Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + llvm::Type *Ty = ConvertType(E->getType()); + if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 && + BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) { + Value *Val = EmitScalarExpr(E->getArg(0)); + return EmitSVEReinterpret(Val, Ty); + } + + auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID, AArch64SVEIntrinsicsProvenSorted); + llvm::SmallVector Ops; SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + if (TypeFlags.isLoad()) return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic, TypeFlags.isZExtReturn()); @@ -9668,10 +9799,10 @@ return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isGatherPrefetch()) return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructLoad()) - return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (TypeFlags.isStructStore()) - return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructLoad()) + return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isStructStore()) + return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops); else if (TypeFlags.isTupleCreate()) @@ -9722,6 +9853,17 @@ Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero); } + // Needed for the psel instruction. + // FIXME: This isn't really a ZA slice, but rather a lane index that takes + // the same form of w12-w15 reg + immediate offset. 
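+  // For illustration, a source-level call such as the following (assuming the
+  // svpsel_lane builtins take a slice base plus an immediate offset as their
+  // last two arguments, as the operand handling below implies):
+  //   svbool_t pick(svbool_t pn, svbool_t pm, uint32_t base) {
+  //     return svpsel_lane_b32(pn, pm, base, 1);
+  //   }
+  // arrives here with Ops = {pn, pm, base, 1}; EmitTileslice folds the base
+  // and the offset into the single lane-index operand the intrinsic expects.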
+ if (TypeFlags.isZASliceBaseOffsetIntr()) { + llvm::Value *SliceBase = Ops[2]; + llvm::Value *SliceOff = Ops[3]; + llvm::Value *Slice = EmitTileslice(SliceOff, SliceBase); + Ops[2] = Slice; + Ops.erase(Ops.begin() + 3); + } + Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, getSVEOverloadTypes(TypeFlags, Ty, Ops)); Value *Call = Builder.CreateCall(F, Ops); @@ -9731,13 +9873,52 @@ if (PredTy->getScalarType()->isIntegerTy(1)) Call = EmitSVEPredicateCast(Call, cast(Ty)); - return Call; + return FormMultiVectorResult(Call); } switch (BuiltinID) { default: return nullptr; + case SVE::BI__builtin_sve_svreinterpret_b: { + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastFromSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); + return Builder.CreateCall(CastFromSVCountF, Ops[0]); + } + case SVE::BI__builtin_sve_svreinterpret_c: { + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); + return Builder.CreateCall(CastToSVCountF, Ops[0]); + } + + case SVE::BI__builtin_sve_svpsel_lane_b8: + case SVE::BI__builtin_sve_svpsel_lane_b16: + case SVE::BI__builtin_sve_svpsel_lane_b32: + case SVE::BI__builtin_sve_svpsel_lane_b64: + case SVE::BI__builtin_sve_svpsel_lane_c8: + case SVE::BI__builtin_sve_svpsel_lane_c16: + case SVE::BI__builtin_sve_svpsel_lane_c32: + case SVE::BI__builtin_sve_svpsel_lane_c64: { + bool IsSVCount = isa(Ops[0]->getType()); + assert(((!IsSVCount || cast(Ops[0]->getType())->getName() == + "aarch64.svcount")) && + "Unexpected TargetExtType"); + auto SVCountTy = llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastFromSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); + + auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier)); + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy); + llvm::Value *Ops0 = IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0]; + llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy); + llvm::Value *Ops2 = EmitTileslice(Ops[3], Ops[2]); + llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops2}); + return IsSVCount ? 
Builder.CreateCall(CastToSVCountF, PSel) : PSel; + } case SVE::BI__builtin_sve_svmov_b_z: { // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op) SVETypeFlags TypeFlags(Builtin->TypeModifier); @@ -9859,6 +10040,13 @@ case SVE::BI__builtin_sve_svpfalse_b: return ConstantInt::getFalse(Ty); + case SVE::BI__builtin_sve_svpfalse_c: { + auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16); + Function *CastToSVCountF = CGM.getIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, Ty); + return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy)); + } + case SVE::BI__builtin_sve_svlen_bf16: case SVE::BI__builtin_sve_svlen_f16: case SVE::BI__builtin_sve_svlen_f32: @@ -9894,13 +10082,8 @@ case SVE::BI__builtin_sve_svtbl2_f64: { SVETypeFlags TF(Builtin->TypeModifier); auto VTy = cast(getSVEType(TF)); - Value *V0 = Builder.CreateExtractVector(VTy, Ops[0], - ConstantInt::get(CGM.Int64Ty, 0)); - unsigned MinElts = VTy->getMinNumElements(); - Value *V1 = Builder.CreateExtractVector( - VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts)); Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy); - return Builder.CreateCall(F, {V0, V1, Ops[1]}); + return Builder.CreateCall(F, Ops); } case SVE::BI__builtin_sve_svset_neonq_s8: @@ -9958,35 +10141,20 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { + // Find out if any arguments are required to be integer constant expressions. unsigned ICEArguments = 0; ASTContext::GetBuiltinTypeError Error; getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); assert(Error == ASTContext::GE_None && "Should not codegen an error"); - llvm::Type *Ty = ConvertType(E->getType()); - llvm::SmallVector Ops; - for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { - if ((ICEArguments & (1 << i)) == 0) - Ops.push_back(EmitScalarExpr(E->getArg(i))); - else { - // If this is required to be a constant, constant fold it so that we know - // that the generated intrinsic gets a ConstantInt. - std::optional Result = - E->getArg(i)->getIntegerConstantExpr(getContext()); - assert(Result && "Expected argument to be a constant"); - - // Immediates for SVE llvm intrinsics are always 32bit. We can safely - // truncate because the immediate has been range checked and no valid - // immediate requires more than a handful of bits. - *Result = Result->extOrTrunc(32); - Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result)); - } - } - auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID, AArch64SMEIntrinsicsProvenSorted); + + llvm::SmallVector Ops; SVETypeFlags TypeFlags(Builtin->TypeModifier); + GetAArch64SMEProcessedOperands(BuiltinID, E, Ops, TypeFlags); + if (TypeFlags.isLoad() || TypeFlags.isStore()) return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) @@ -9997,21 +10165,60 @@ else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za || BuiltinID == SME::BI__builtin_sme_svstr_vnum_za) return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); - else if (Builtin->LLVMIntrinsic != 0) { - // Predicates must match the main datatype. 
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (auto PredTy = dyn_cast(Ops[i]->getType())) - if (PredTy->getElementType()->isIntegerTy(1)) - Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags)); - Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, - getSVEOverloadTypes(TypeFlags, Ty, Ops)); - Value *Call = Builder.CreateCall(F, Ops); - return Call; + // Should not happen! + if (Builtin->LLVMIntrinsic == 0) + return nullptr; + + if (TypeFlags.isZASliceBaseOffsetIntr() || TypeFlags.isTileBaseOffsetIntr()) { + unsigned BaseIdx = TypeFlags.isZASliceBaseOffsetIntr() ? 0 : 1; + + llvm::Value *SliceBase = Ops[BaseIdx]; + llvm::Value *SliceOff = Ops[BaseIdx + 1]; + llvm::Value *Slice = EmitTileslice(SliceOff, SliceBase); + Ops[BaseIdx] = Slice; + Ops.erase(Ops.begin() + BaseIdx + 1); } - /// Should not happen - return nullptr; + // Predicates must match the main datatype. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (auto PredTy = dyn_cast(Ops[i]->getType())) + if (PredTy->getElementType()->isIntegerTy(1)) { + PredTy = getSVEType(TypeFlags); + Ops[i] = EmitSVEPredicateCast(Ops[i], PredTy); + } + + if (TypeFlags.isReadZA() || TypeFlags.isWriteZA()) { + llvm::ScalableVectorType *FnType = getSVEType(TypeFlags); + Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic, {FnType}); + + // FIXME! We probably shouldn't be adding the slice_base (Op[1]) to the + // slice_offset (Op[2]) here. Instead I think it makes sense to change the + // LLVM intrinsic to match the ACLE intrinsic. + llvm::Value *SliceBase = TypeFlags.isReadZA() ? Ops[3] : Ops[1]; + llvm::Value *SliceOff = TypeFlags.isReadZA() ? Ops[4] : Ops[2]; + llvm::Value *Slice = EmitTileslice(SliceOff, SliceBase); + + if (TypeFlags.isReadZA()) + return Builder.CreateCall( + F, {/*Passthru*/ Ops[0], /*Pg*/ Ops[1], /*Tile*/ Ops[2], Slice}); + else + return Builder.CreateCall( + F, {/*Tile*/ Ops[0], Slice, /*Pg*/ Ops[3], /*Write data*/ Ops[4]}); + } + + Function *F = + TypeFlags.isOverloadNone() + ? CGM.getIntrinsic(Builtin->LLVMIntrinsic) + : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)}); + Value *Call = Builder.CreateCall(F, Ops); + + // Predicate results must be converted to svbool_t. + if (auto PredTy = dyn_cast(Call->getType())) + if (PredTy->getScalarType()->isIntegerTy(1)) + Call = EmitSVEPredicateCast(Call, ScalableVectorType::get(Builder.getInt1Ty(), 16)); + + return FormMultiVectorResult(Call); } Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, @@ -10058,6 +10265,26 @@ return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); } + if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) { + // Create call to __arm_sme_state and store the results to the two pointers. 
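+    // For illustration, the builtin handled here takes two pointers that
+    // receive the two state words stored below, so a caller might write:
+    //   uint64_t sme_state[2];
+    //   __builtin_arm_get_sme_state(&sme_state[0], &sme_state[1]);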
+ CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {}, + false), + "__arm_sme_state")); + auto Attrs = + AttributeList() + .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved"); + CI->setAttributes(Attrs); + CI->setCallingConv( + llvm::CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + Builder.CreateStore(Builder.CreateExtractValue(CI, 0), + EmitPointerWithAlignment(E->getArg(0))); + return Builder.CreateStore(Builder.CreateExtractValue(CI, 1), + EmitPointerWithAlignment(E->getArg(1))); + } + if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { assert((getContext().getTypeSize(E->getType()) == 32) && "rbit of unusual size!"); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4285,8 +4285,13 @@ llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID); - llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); + void GetAArch64SMEProcessedOperands(unsigned BuiltinID, const CallExpr *E, + SmallVectorImpl &Ops, + SVETypeFlags TypeFlags); + llvm::Value *FormMultiVectorResult(llvm::Value *Call); + + llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitSMELd1St1(SVETypeFlags TypeFlags, llvm::SmallVectorImpl &Ops, unsigned IntID); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -358,8 +358,8 @@ clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h) # Generate arm_sve.h clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h) - # Generate arm_sme_draft_spec_subject_to_change.h - clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme_draft_spec_subject_to_change.h) + # Generate arm_sme.h + clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h) # Generate arm_bf16.h clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h) # Generate arm_mve.h @@ -380,7 +380,7 @@ list(APPEND aarch64_only_generated_files "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h" - "${CMAKE_CURRENT_BINARY_DIR}/arm_sme_draft_spec_subject_to_change.h" + "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h" ) endif() diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2058,7 +2058,8 @@ llvm::StringMap CallerFeatureMap; Context.getFunctionFeatureMap(CallerFeatureMap, FD); if (!Builtin::evaluateRequiredTargetFeatures( - "sve", CallerFeatureMap)) + "sve", CallerFeatureMap) && !Builtin::evaluateRequiredTargetFeatures( + "sme", CallerFeatureMap)) Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty; } }; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2914,21 +2914,8 @@ llvm_unreachable("Invalid NeonTypeFlag!"); } -bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { - // Range check SVE intrinsics that take immediate values. 
- SmallVector, 3> ImmChecks; - - switch (BuiltinID) { - default: - return false; -#define GET_SVE_IMMEDIATE_CHECK -#include "clang/Basic/arm_sve_sema_rangechecks.inc" -#undef GET_SVE_IMMEDIATE_CHECK -#define GET_SME_IMMEDIATE_CHECK -#include "clang/Basic/arm_sme_sema_rangechecks.inc" -#undef GET_SME_IMMEDIATE_CHECK - } - +bool Sema::ParseSVEImmChecks( + CallExpr *TheCall, SmallVector, 3> &ImmChecks) { // Perform all the immediate checks for this builtin call. bool HasError = false; for (auto &I : ImmChecks) { @@ -3044,14 +3031,214 @@ if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255)) HasError = true; break; + case SVETypeFlags::ImmCheck0_2_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 2) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_6_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 6) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_14_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 14) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_4_Mul4: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 4) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_12_Mul4: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 12) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 4)) + HasError = true; + break; + case SVETypeFlags::ImmCheck2_4_Mul2: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2)) + HasError = true; + break; + case SVETypeFlags::ImmCheck0_56_Mul8: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 56) || + SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 8)) + HasError = true; + break; + case SVETypeFlags::ImmCheck1_1: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 1)) + HasError = true; + break; + case SVETypeFlags::ImmCheck1_3: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 3)) + HasError = true; + break; + case SVETypeFlags::ImmCheck1_7: + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7)) + HasError = true; + break; } } return HasError; } +enum ArmStreamingType { + ArmNonStreaming, + ArmStreaming, + ArmStreamingCompatible, + ArmLocallyStreaming, + ArmStreamingOrSVE2p1 +}; + +static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { + if (FD->hasAttr()) + return ArmLocallyStreaming; + if (const auto *T = dyn_cast(FD->getType())) { + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } + return ArmNonStreaming; +} + +static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, + const FunctionDecl *FD, + ArmStreamingType BuiltinType) { + assert(BuiltinType != ArmLocallyStreaming && + "Unexpected locally_streaming attribute for builtin!"); + + ArmStreamingType FnType = getArmStreamingFnType(FD); + if (BuiltinType == ArmStreamingOrSVE2p1) { + // Check intrinsics that are available in [sve2p1 or sme/sme2]. 
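+    // For example (illustrative, not from the test suite): a builtin tagged
+    // IsStreamingOrSVE2p1, such as svsqdmulh_s32_x2 from arm_sme.td, is
+    // accepted from a non-streaming function when the caller's features
+    // include sve2p1, but with only +sme2 the same call is diagnosed unless
+    // the caller is marked __attribute__((arm_streaming)):
+    //   svint32x2_t scale(svint32x2_t a, svint32x2_t b) {
+    //     return svsqdmulh_s32_x2(a, b);
+    //   }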
+ llvm::StringMap CallerFeatureMap; + S.Context.getFunctionFeatureMap(CallerFeatureMap, FD); + if (Builtin::evaluateRequiredTargetFeatures("sve2p1", CallerFeatureMap)) + BuiltinType = ArmStreamingCompatible; + else + BuiltinType = ArmStreaming; + } + + if ((FnType == ArmStreaming || FnType == ArmLocallyStreaming) && + BuiltinType == ArmNonStreaming) + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming or locally streaming"; + + if ((FnType == ArmStreamingCompatible) && + BuiltinType != ArmStreamingCompatible) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming compatible"; + return; + } + + if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming) + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "non-streaming"; +} + +static bool hasSMEZAState(const FunctionDecl *FD) { + if (FD->hasAttr()) + return true; + if (const auto *T = dyn_cast(FD->getType())) + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask) + return true; + return false; +} + +static bool hasSMEZAState(unsigned BuiltinID) { + switch (BuiltinID) { + default: + return false; +#define GET_SME_BUILTIN_HAS_ZA_STATE +#include "clang/Basic/arm_sme_builtins_za_state.inc" +#undef GET_SME_BUILTIN_HAS_ZA_STATE + } +} + +bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_SME_STREAMING_ATTRS +#include "clang/Basic/arm_sme_streaming_attrs.inc" +#undef GET_SME_STREAMING_ATTRS + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + + if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD)) + Diag(TheCall->getBeginLoc(), + diag::warn_attribute_arm_za_builtin_no_za_state) + << TheCall->getSourceRange(); + } + + // Range check SME intrinsics that take immediate values. + SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SME_IMMEDIATE_CHECK +#include "clang/Basic/arm_sme_sema_rangechecks.inc" +#undef GET_SME_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + +bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_SVE_STREAMING_ATTRS +#include "clang/Basic/arm_sve_streaming_attrs.inc" +#undef GET_SVE_STREAMING_ATTRS + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + } + + // Range check SVE intrinsics that take immediate values. 
+ SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SVE_IMMEDIATE_CHECK +#include "clang/Basic/arm_sve_sema_rangechecks.inc" +#undef GET_SVE_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + std::optional BuiltinType; + + switch (BuiltinID) { + default: + break; +#define GET_NEON_STREAMING_COMPAT_FLAG +#include "clang/Basic/arm_neon.inc" +#undef GET_NEON_STREAMING_COMPAT_FLAG + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + } + llvm::APSInt Result; uint64_t mask = 0; unsigned TV = 0; @@ -3414,6 +3601,9 @@ if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall)) return true; + if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall)) + return true; + // For intrinsics which take an immediate value as part of the instruction, // range check them here. unsigned i = 0, l = 0, u = 0; diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp --- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -278,18 +278,18 @@ preserves_za_decl); } -// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za32_u32( // CHECK-CXX-LABEL: @_Z21test_svaddha_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -25,6 +26,7 @@ SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za32_u32_1( // CHECK-CXX-LABEL: @_Z23test_svaddha_za32_u32_1u10__SVBool_tu10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -37,6 +39,7 @@ SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za32_s32( // CHECK-CXX-LABEL: @_Z21test_svaddha_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -49,6 +52,7 @@ SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za32_s32_1( // CHECK-CXX-LABEL: @_Z23test_svaddha_za32_s32_1u10__SVBool_tu10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -61,6 +65,7 @@ SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za32_u32( // CHECK-CXX-LABEL: @_Z21test_svaddva_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -73,6 +78,7 @@ SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za32_u32_1( // CHECK-CXX-LABEL: @_Z23test_svaddva_za32_u32_1u10__SVBool_tu10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -85,6 +91,7 @@ SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za32_s32( // CHECK-CXX-LABEL: @_Z21test_svaddva_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -97,6 +104,7 @@ SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za32_s32_1( // CHECK-CXX-LABEL: @_Z23test_svaddva_za32_s32_1u10__SVBool_tu10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za64_u64( // CHECK-CXX-LABEL: @_Z21test_svaddha_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -25,6 +26,7 @@ SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za64_u64_1( // CHECK-CXX-LABEL: @_Z23test_svaddha_za64_u64_1u10__SVBool_tu10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -37,6 +39,7 @@ SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za64_s64( // CHECK-CXX-LABEL: @_Z21test_svaddha_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -49,6 +52,7 @@ SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddha_za64_s64_1( // CHECK-CXX-LABEL: @_Z23test_svaddha_za64_s64_1u10__SVBool_tu10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -61,6 +65,7 @@ SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za64_u64( // CHECK-CXX-LABEL: @_Z21test_svaddva_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -73,6 +78,7 @@ SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za64_u64_1( // CHECK-CXX-LABEL: @_Z23test_svaddva_za64_u64_1u10__SVBool_tu10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -85,6 +91,7 @@ SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: 
@test_svaddva_za64_s64( // CHECK-CXX-LABEL: @_Z21test_svaddva_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -97,6 +104,7 @@ SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svaddva_za64_s64_1( // CHECK-CXX-LABEL: @_Z23test_svaddva_za64_s64_1u10__SVBool_tu10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add.c @@ -0,0 +1,188 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + SVE_ACLE_FUNC(svaddha_za32,_s32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + SVE_ACLE_FUNC(svaddha_za32,_u32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + SVE_ACLE_FUNC(svaddha_za64,_s64,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddha_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddha_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + SVE_ACLE_FUNC(svaddha_za64,_u64,_m,)(3, pn, pm, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za32_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_s32u10__SVBool_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + SVE_ACLE_FUNC(svaddva_za32,_s32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svaddva_za32_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za32_u32u10__SVBool_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + SVE_ACLE_FUNC(svaddva_za32,_u32,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_s64u10__SVBool_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + SVE_ACLE_FUNC(svaddva_za64,_s64,_m,)(3, pn, pm, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svaddva_za64_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svaddva_za64_u64u10__SVBool_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + SVE_ACLE_FUNC(svaddva_za64,_u64,_m,)(3, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -1,46 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror 
-emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -#include +#include -// CHECK-C-LABEL: @test_svcntsb( -// CHECK-CXX-LABEL: @_Z12test_svcntsbv( +// CHECK-LABEL: @test_svcntsb( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() // CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntsb() { +// CPP-CHECK-LABEL: @_Z12test_svcntsbv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsb(void) { return svcntsb(); } -// CHECK-C-LABEL: @test_svcntsh( -// CHECK-CXX-LABEL: @_Z12test_svcntshv( +// CHECK-LABEL: @test_svcntsh( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() // CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntsh() { +// CPP-CHECK-LABEL: @_Z12test_svcntshv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsh(void) { return svcntsh(); } -// CHECK-C-LABEL: @test_svcntsw( -// CHECK-CXX-LABEL: @_Z12test_svcntswv( +// CHECK-LABEL: @test_svcntsw( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() // CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntsw() { +// CPP-CHECK-LABEL: @_Z12test_svcntswv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsw(void) { return svcntsw(); } -// CHECK-C-LABEL: @test_svcntsd( -// CHECK-CXX-LABEL: @_Z12test_svcntsdv( +// CHECK-LABEL: @test_svcntsd( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() // CHECK-NEXT: ret i64 [[TMP0]] // -uint64_t test_svcntsd() { +// CPP-CHECK-LABEL: @_Z12test_svcntsdv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +__attribute__((arm_streaming_compatible)) +uint64_t test_svcntsd(void) { return svcntsd(); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svld1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv( @@ -19,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_hor_za8(0, slice_base, 0, pg, ptr); svld1_hor_za8(0, slice_base, 15, pg, ptr); } @@ -33,7 +27,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_hor_za16(0, slice_base, 0, pg, ptr); svld1_hor_za16(1, slice_base, 7, pg, ptr); } @@ -47,7 +41,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_hor_za32(0, slice_base, 0, pg, ptr); svld1_hor_za32(3, slice_base, 3, pg, ptr); } @@ -61,7 +55,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_hor_za64(0, slice_base, 0, pg, ptr); svld1_hor_za64(7, slice_base, 1, pg, ptr); } @@ -74,7 +68,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_hor_za128(0, slice_base, 0, pg, ptr); svld1_hor_za128(15, slice_base, 0, pg, ptr); } @@ -87,7 +81,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void 
test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_ver_za8(0, slice_base, 0, pg, ptr); svld1_ver_za8(0, slice_base, 15, pg, ptr); } @@ -101,7 +95,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_ver_za16(0, slice_base, 0, pg, ptr); svld1_ver_za16(1, slice_base, 7, pg, ptr); } @@ -115,7 +109,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_ver_za32(0, slice_base, 0, pg, ptr); svld1_ver_za32(3, slice_base, 3, pg, ptr); } @@ -129,7 +123,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_ver_za64(0, slice_base, 0, pg, ptr); svld1_ver_za64(7, slice_base, 1, pg, ptr); } @@ -142,7 +136,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { svld1_ver_za128(0, slice_base, 0, pg, ptr); svld1_ver_za128(15, slice_base, 0, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include - 
-#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svld1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl( @@ -22,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); svld1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); } @@ -39,7 +33,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); svld1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); } @@ -56,7 +50,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum); svld1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); } @@ -73,7 +67,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); svld1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); } @@ -89,7 +83,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); svld1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum); } @@ -105,7 +99,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); svld1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); } @@ -122,7 +116,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] 
[[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); svld1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); } @@ -139,7 +133,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); svld1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); } @@ -156,7 +150,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); svld1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); } @@ -172,7 +166,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { svld1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); svld1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -3,8 +3,9 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( // CHECK-NEXT: entry: @@ -15,6 +16,7 @@ svldr_vnum_za(slice_base, 0, ptr); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svldr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_loads.c @@ -0,0 +1,499 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( 
[[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: 
ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svld1_ver_vnum_za8ju10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, const int8_t *base) { + svld1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKs( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const int16_t *base) { + svld1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const int32_t *base) { + svld1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svld1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const int64_t *base) { + svld1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svld1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const __int128_t *base) { + svld1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svldr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svldr_vnum_zajPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svldr_vnum_za(uint32_t slice_base, const uint8_t *base) { + svldr_vnum_za(slice_base, 15, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mem_ops.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_arm_sc_memcpy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memcpyPvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef 
[[N:%.*]]) #[[ATTR2:[0-9]+]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memcpy(void *dest, const void *src, size_t n) { + return __arm_sc_memcpy(dest, src, n); +} + +// CHECK-LABEL: @test_arm_sc_memmove( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z19test_arm_sc_memmovePvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memmove(void *dest, const void *src, size_t n) { + return __arm_sc_memmove(dest, src, n); +} + +// CHECK-LABEL: @test_arm_sc_memset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memsetPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memset(void *dest, int c, size_t n) { + return __arm_sc_memset(dest, c, n); +} + +// CHECK-LABEL: @test_arm_sc_memchr( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z18test_arm_sc_memchrPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[DEST:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR2]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +__attribute__((arm_streaming_compatible)) +void *test_arm_sc_memchr(void *dest, int c, size_t n) { + return __arm_sc_memchr(dest, c, n); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mop.c @@ -0,0 +1,420 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -target-feature +sme-f64f64 -Werror -o - %s | FileCheck %s +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -target-feature +sme-f64f64 -Werror -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -DSVE_OVERLOADED_FORMS -target-feature +sme-f64f64 -Werror -o - %s | FileCheck %s +// RUN: %clang_cc1 -S -O1 -emit-llvm -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 \ +// RUN: -DSVE_OVERLOADED_FORMS -target-feature +sme-f64f64 -Werror -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A 
simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == MOPA / MOPS == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmopa_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_bf16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_f16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmopa_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_u16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmops_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t 
zm) { + SVE_ACLE_FUNC(svmops_za32,_bf16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svmops_za32,_f16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svmops_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svmops_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svmops_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmops_za64,_u16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_f32,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_za64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmopa_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmopa_za64,_f64,_m,)(7, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za32_f32( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmops_za32,_f32,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_za64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmops_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmops_za64,_f64,_m,)(7, pn, pm, zn, zm); +} + +// == MIXED SIGN MOPA / MOPS == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumops_za32_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svsumops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsumops_za32,_s8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumops_za64_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svsumops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svsumops_za64,_s16,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusmops_za32_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svusmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusmops_za32,_u8,_m,)(3, pn, pm, zn, zm); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusmops_za64_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svusmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svusmops_za64,_u16,_m,)(3, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za32_s8( // CHECK-CXX-LABEL: @_Z19test_svmopa_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -23,6 +24,7 @@ SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za32_u8( // CHECK-CXX-LABEL: @_Z19test_svmopa_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -33,6 +35,7 @@ 
SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za32_bf16( // CHECK-CXX-LABEL: @_Z21test_svmopa_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -45,6 +48,7 @@ SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za32_f16( // CHECK-CXX-LABEL: @_Z20test_svmopa_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -57,6 +61,7 @@ SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za32_f32( // CHECK-CXX-LABEL: @_Z20test_svmopa_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -69,6 +74,7 @@ SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svsumopa_za32_s8( // CHECK-CXX-LABEL: @_Z21test_svsumopa_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -79,6 +85,7 @@ SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svusmopa_za32_u8( // CHECK-CXX-LABEL: @_Z21test_svusmopa_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu10__SVInt8_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za64_s16( // CHECK-CXX-LABEL: @_Z20test_svmopa_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -25,6 +26,7 @@ SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za64_u16( // CHECK-CXX-LABEL: @_Z20test_svmopa_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -37,6 +39,7 @@ SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmopa_za64_f64( // CHECK-CXX-LABEL: @_Z20test_svmopa_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -49,6 +52,7 @@ SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svsumopa_za64_s16( // CHECK-CXX-LABEL: @_Z22test_svsumopa_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -61,6 +65,7 @@ SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // 
CHECK-C-LABEL: @test_svusmopa_za64_u16( // CHECK-CXX-LABEL: @_Z22test_svusmopa_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu11__SVInt16_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za32_s8( // CHECK-CXX-LABEL: @_Z19test_svmops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -23,6 +24,7 @@ SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za32_u8( // CHECK-CXX-LABEL: @_Z19test_svmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -33,6 +35,7 @@ SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za32_bf16( // CHECK-CXX-LABEL: @_Z21test_svmops_za32_bf16u10__SVBool_tu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -45,6 +48,7 @@ SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za32_f16( // CHECK-CXX-LABEL: @_Z20test_svmops_za32_f16u10__SVBool_tu10__SVBool_tu13__SVFloat16_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -57,6 +61,7 @@ SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za32_f32( // CHECK-CXX-LABEL: @_Z20test_svmops_za32_f32u10__SVBool_tu10__SVBool_tu13__SVFloat32_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -69,6 +74,7 @@ SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svsumops_za32_s8( // CHECK-CXX-LABEL: @_Z21test_svsumops_za32_s8u10__SVBool_tu10__SVBool_tu10__SVInt8_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -79,6 +85,7 @@ SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svusmops_za32_u8( // CHECK-CXX-LABEL: @_Z21test_svusmops_za32_u8u10__SVBool_tu10__SVBool_tu11__SVUint8_tu10__SVInt8_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za64_s16( // CHECK-CXX-LABEL: @_Z20test_svmops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -25,6 +26,7 @@ SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za64_u16( // CHECK-CXX-LABEL: @_Z20test_svmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -37,6 +39,7 @@ SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svmops_za64_f64( // CHECK-CXX-LABEL: @_Z20test_svmops_za64_f64u10__SVBool_tu10__SVBool_tu13__SVFloat64_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -49,6 +52,7 @@ SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svsumops_za64_s16( // CHECK-CXX-LABEL: @_Z22test_svsumops_za64_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -61,6 +65,7 @@ SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svusmops_za64_u16( // CHECK-CXX-LABEL: @_Z22test_svusmops_za64_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu11__SVInt16_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za8_s8( // CHECK-CXX-LABEL: @_Z22test_svread_hor_za8_s8u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -23,6 +24,7 @@ return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za8_s8_1( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za8_s8_1u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -34,6 +36,7 @@ return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 15); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_s16( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za16_s16u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -45,6 +48,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -57,6 +61,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _s16, 
_m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_s32( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za32_s32u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -68,6 +73,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -80,6 +86,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_s64( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za64_s64u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -91,6 +98,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -103,6 +111,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice_base, 1); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za8_u8( // CHECK-CXX-LABEL: @_Z22test_svread_hor_za8_u8u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -113,6 +122,7 @@ return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za8_u8_1( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za8_u8_1u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -124,6 +134,7 @@ return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 15); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_u16( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za16_u16u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -135,6 +146,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -147,6 +159,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_u32( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za32_u32u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -158,6 +171,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -170,6 +184,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_u64( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za64_u64u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -181,6 +196,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -193,6 +209,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice_base, 1); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_f16( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za16_f16u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -204,6 +221,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -216,6 +234,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_bf16( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za16_bf16u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -227,6 +246,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -239,6 +259,7 @@ return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_f32( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za32_f32u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -250,6 +271,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -262,6 +284,7 @@ return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_f64( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za64_f64u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -273,6 +296,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -285,6 +309,7 @@ return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice_base, 1); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s8( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za128_s8u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -295,6 +320,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s8_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za128_s8_1u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -305,6 +331,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s16( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_s16u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -316,6 +343,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -327,6 +355,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base, 0); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s32( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_s32u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -338,6 +367,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s32_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -349,6 +379,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s64( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_s64u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -360,6 +391,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_s64_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -371,6 +403,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u8( // CHECK-CXX-LABEL: @_Z24test_svread_hor_za128_u8u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -381,6 +414,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u8_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za128_u8_1u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -391,6 +425,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u16( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_u16u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -402,6 +437,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -413,6 +449,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u32( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_u32u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -424,6 +461,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u32_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -435,6 +473,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u64( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_u64u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -446,6 +485,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_u64_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -457,6 +497,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base, 0); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f16( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_f16u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -468,6 +509,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -479,6 +521,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_bf16( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za128_bf16u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -490,6 +533,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_bf16_1( // CHECK-CXX-LABEL: @_Z28test_svread_hor_za128_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -501,6 +545,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f32( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_f32u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -512,6 +557,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f32_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -523,6 +569,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f64( // CHECK-CXX-LABEL: @_Z25test_svread_hor_za128_f64u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -534,6 +581,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_hor_za128_f64_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za128_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -545,6 +593,7 @@ return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za8_s8( // CHECK-CXX-LABEL: @_Z22test_svread_ver_za8_s8u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -555,6 +604,7 @@ return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za8_s8_1( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za8_s8_1u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -566,6 +616,7 @@ return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 15); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_s16( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za16_s16u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -577,6 +628,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -589,6 +641,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice_base, 7); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_s32( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za32_s32u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -600,6 +653,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -612,6 +666,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_s64( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za64_s64u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -623,6 +678,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -635,6 +691,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice_base, 1); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za8_u8( // CHECK-CXX-LABEL: @_Z22test_svread_ver_za8_u8u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -645,6 +702,7 @@ return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za8_u8_1( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za8_u8_1u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -656,6 +714,7 @@ return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 15); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_u16( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za16_u16u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -667,6 +726,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -679,6 +739,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_u32( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za32_u32u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -690,6 +751,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -702,6 +764,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_u64( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za64_u64u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -713,6 +776,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -725,6 +789,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice_base, 1); } +__attribute__((arm_streaming, arm_shared_za)) // 
CHECK-C-LABEL: @test_svread_ver_za16_f16( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za16_f16u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -736,6 +801,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -748,6 +814,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_bf16( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za16_bf16u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -759,6 +826,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -771,6 +839,7 @@ return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_f32( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za32_f32u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -782,6 +851,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -794,6 +864,7 @@ return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice_base, 3); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_f64( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za64_f64u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -805,6 +876,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -817,6 +889,7 @@ return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice_base, 1); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s8( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za128_s8u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -827,6 +900,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s8_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za128_s8_1u10__SVInt8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -837,6 +911,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s16( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_s16u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -848,6 +923,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -859,6 +935,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: 
@test_svread_ver_za128_s32( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_s32u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -870,6 +947,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s32_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -881,6 +959,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s64( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_s64u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -892,6 +971,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_s64_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -903,6 +983,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u8( // CHECK-CXX-LABEL: @_Z24test_svread_ver_za128_u8u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -913,6 +994,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u8_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za128_u8_1u11__SVUint8_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -923,6 +1005,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u16( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_u16u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -934,6 +1017,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -945,6 +1029,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u32( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_u32u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -956,6 +1041,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u32_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -967,6 +1053,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u64( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_u64u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -978,6 +1065,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_u64_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -989,6 +1077,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: 
@test_svread_ver_za128_f16( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_f16u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1000,6 +1089,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_f16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1011,6 +1101,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_bf16( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za128_bf16u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1022,6 +1113,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_bf16_1( // CHECK-CXX-LABEL: @_Z28test_svread_ver_za128_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1033,6 +1125,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_f32( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_f32u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1044,6 +1137,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_f32_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1055,6 +1149,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_f64( // CHECK-CXX-LABEL: @_Z25test_svread_ver_za128_f64u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: @@ -1066,6 +1161,7 @@ return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base, 0); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svread_ver_za128_f64_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za128_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_reads.c @@ -0,0 +1,964 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_hor_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za8,_s8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_hor_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_hor_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za8,_u8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_hor_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_s16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_hor_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_u16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_hor_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_f16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_hor_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za16,_bf16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_hor_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_s32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_hor_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_u32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_hor_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za32,_f32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_hor_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_s64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// 
CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_hor_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_u64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat64_t test_svread_hor_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za64,_f64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_hor_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_hor_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// 
CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_hor_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_hor_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_hor_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_hor_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_bf16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svread_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_hor_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_hor_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_hor_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: 
ret [[TMP1]] +// +svint64_t test_svread_hor_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_s64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svread_hor_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_u64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_hor_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_hor_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_hor_za128,_f64,_m,)(zd, pg, 15, slice_base, 0); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint8_t test_svread_ver_za8_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za8,_s8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z24test_svread_ver_za8_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint8_t test_svread_ver_za8_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za8,_u8,_m,)(zd, pg, 0, slice_base, 15); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svread_ver_za16_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_s16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svread_ver_za16_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_u16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za16_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_svread_ver_za16_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_f16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_m( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za16_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svread_ver_za16_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za16,_bf16,_m,)(zd, pg, 1, slice_base, 7); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svread_ver_za32_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_s32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za32_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svread_ver_za32_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_u32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: 
@_Z26test_svread_ver_za32_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svread_ver_za32_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za32,_f32,_m,)(zd, pg, 3, slice_base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svread_ver_za64_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_s64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svread_ver_za64_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_u64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za64_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret 
[[TMP2]] +// +svfloat64_t test_svread_ver_za64_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za64,_f64,_m,)(zd, pg, 7, slice_base, 1); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za128_s8_mu10__SVInt8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svread_ver_za128_s8_m(svint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za128_u8_mu11__SVUint8_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svread_ver_za128_u8_m(svuint8_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u8,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s16_mu11__SVInt16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svread_ver_za128_s16_m(svint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u16_mu12__SVUint16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svread_ver_za128_u16_m(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u16,_m,)(zd, pg, 
15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f16_mu13__SVFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svread_ver_za128_f16_m(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za128_bf16_mu14__SVBFloat16_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svread_ver_za128_bf16_m(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_bf16,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s32_mu11__SVInt32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svread_ver_za128_s32_m(svint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u32_mu12__SVUint32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svread_ver_za128_u32_m(svuint32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f32_mu13__SVFloat32_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svread_ver_za128_f32_m(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f32,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_s64_mu11__SVInt64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svread_ver_za128_s64_m(svint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_s64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_u64_mu12__SVUint64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svread_ver_za128_u64_m(svuint64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_u64,_m,)(zd, pg, 15, slice_base, 0); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// 
CPP-CHECK-LABEL: @_Z27test_svread_ver_za128_f64_mu13__SVFloat64_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svread_ver_za128_f64_m(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { + return SVE_ACLE_FUNC(svread_ver_za128,_f64,_m,)(zd, pg, 15, slice_base, 0); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svst1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv( @@ -19,7 +13,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_hor_za8(0, slice_base, 0, pg, ptr); svst1_hor_za8(0, slice_base, 15, pg, ptr); } @@ -33,7 +27,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_hor_za16(0, slice_base, 0, pg, ptr); svst1_hor_za16(1, slice_base, 7, pg, ptr); } @@ -47,7 +41,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_hor_za32(0, slice_base, 0, pg, ptr); svst1_hor_za32(3, slice_base, 3, pg, ptr); } @@ -61,7 +55,7 @@ // 
CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_hor_za64(0, slice_base, 0, pg, ptr); svst1_hor_za64(7, slice_base, 1, pg, ptr); } @@ -74,7 +68,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_hor_za128(0, slice_base, 0, pg, ptr); svst1_hor_za128(15, slice_base, 0, pg, ptr); } @@ -87,7 +81,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_ver_za8(0, slice_base, 0, pg, ptr); svst1_ver_za8(0, slice_base, 15, pg, ptr); } @@ -101,7 +95,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_ver_za16(0, slice_base, 0, pg, ptr); svst1_ver_za16(1, slice_base, 7, pg, ptr); } @@ -115,7 +109,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_ver_za32(0, slice_base, 0, pg, ptr); svst1_ver_za32(3, slice_base, 3, pg, ptr); } @@ -129,7 +123,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_ver_za64(0, slice_base, 0, pg, ptr); svst1_ver_za64(7, slice_base, 1, pg, ptr); } @@ -142,7 +136,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) { svst1_ver_za128(0, slice_base, 0, pg, ptr); svst1_ver_za128(15, slice_base, 0, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: 
%clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svst1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl( @@ -22,7 +16,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); svst1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); } @@ -39,7 +33,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); svst1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); } @@ -56,7 +50,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum); svst1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); } @@ -73,7 +67,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); svst1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); } @@ -89,7 +83,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void 
test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); svst1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum); } @@ -105,7 +99,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); svst1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); } @@ -122,7 +116,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); svst1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); } @@ -139,7 +133,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); svst1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); } @@ -156,7 +150,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); svst1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); } @@ -172,7 +166,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { +__attribute__((arm_streaming, arm_shared_za)) void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) { svst1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); svst1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o 
- -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_in_streaming_mode_from_normal( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z34test_in_streaming_mode_from_normalv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5:[0-9]+]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode_from_normal(void) { + return __arm_in_streaming_mode(); +} + +// +// +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_in_streaming_mode_from_streaming( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z37test_in_streaming_mode_from_streamingv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_in_streaming_mode_from_streaming(void) { + return __arm_in_streaming_mode(); +} + +// CHECK-LABEL: @test_za_disable( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR6:[0-9]+]] +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_za_disablev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR6:[0-9]+]] +// CPP-CHECK-NEXT: ret void +// +void test_za_disable(void) { + __arm_za_disable(); +} + +// CHECK-LABEL: @test_has_sme( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +// CPP-CHECK-LABEL: @_Z12test_has_smev( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR5]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0 +// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]] +// +bool test_has_sme(void) { + return __arm_has_sme(); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svundef_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svundef_zav( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: ret void +// +void test_svundef_za(void) { + svundef_za(); +} + diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_stores.c @@ -0,0 +1,500 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_za32(3, slice_base, 3, pg, base); +} + +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_hor_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_hor_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 
[[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_hor_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_hor_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_hor_vnum_za64(7, slice_base, 1, pg, base, 3); 
+} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_hor_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_hor_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[BASE:%.*]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_za8(0, slice_base, 15, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[BASE:%.*]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_za16(1, slice_base, 7, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[BASE:%.*]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_za32(3, slice_base, 3, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[BASE:%.*]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_za64(7, slice_base, 1, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[BASE:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_za128(15, slice_base, 0, pg, base); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], ptr [[TMP3]], i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, int8_t *base) { + svst1_ver_vnum_za8(0, slice_base, 15, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP1]], ptr [[TMP4]], i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, int16_t *base) { + svst1_ver_vnum_za16(1, slice_base, 7, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP1]], ptr [[TMP4]], i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, int32_t *base) { + svst1_ver_vnum_za32(3, slice_base, 3, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP3]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert( [[TMP1]], ptr [[TMP4]], i32 7, i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, int64_t *base) { + svst1_ver_vnum_za64(7, slice_base, 1, pg, base, 3); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svst1_ver_vnum_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPn( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, __int128_t *base) { + svst1_ver_vnum_za128(15, slice_base, 0, pg, base, 3); +} + +// +// +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svstr_vnum_za( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstr_vnum_zajPh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP1]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TMP3]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstr_vnum_za(uint32_t slice_base, uint8_t *base) { + svstr_vnum_za(slice_base, 15, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -3,8 +3,9 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svstr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( // CHECK-NEXT: entry: @@ -15,6 +16,7 @@ 
svstr_vnum_za(slice_base, 0, ptr); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svstr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -13,6 +13,7 @@ #define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za8_s8( // CHECK-CXX-LABEL: @_Z23test_svwrite_hor_za8_s8ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -23,6 +24,7 @@ SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za8_s8_1( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za8_s8_1ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -34,6 +36,7 @@ SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 15, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_s16( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za16_s16ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -45,6 +48,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_s16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za16_s16_1ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -57,6 +61,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_s32( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za32_s32ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -68,6 +73,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_s32_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za32_s32_1ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -80,6 +86,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice_base, 3, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_s64( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za64_s64ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -91,6 +98,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_s64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za64_s64_1ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -103,6 +111,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za8_u8( // CHECK-CXX-LABEL: @_Z23test_svwrite_hor_za8_u8ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -113,6 +122,7 @@ SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, 
arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za8_u8_1( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za8_u8_1ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -124,6 +134,7 @@ SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 15, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_u16( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za16_u16ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -135,6 +146,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_u16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za16_u16_1ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -147,6 +159,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_u32( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za32_u32ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -158,6 +171,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_u32_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za32_u32_1ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -170,6 +184,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice_base, 3, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_u64( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za64_u64ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -181,6 +196,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_u64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za64_u64_1ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -193,6 +209,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_f16( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za16_f16ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -204,6 +221,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_f16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za16_f16_1ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -216,6 +234,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za16_bf16ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -227,6 +246,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za16_bf16_1ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -239,6 +259,7 @@ SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_f32( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za32_f32ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -250,6 +271,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za32_f32_1( // 
CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za32_f32_1ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -262,6 +284,7 @@ SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice_base, 3, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_f64( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za64_f64ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -273,6 +296,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za64_f64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za64_f64_1ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -285,6 +309,7 @@ SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s8( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za128_s8ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -295,6 +320,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s8_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za128_s8_1ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -305,6 +331,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s16( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_s16ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -316,6 +343,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_s16_1ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -327,6 +355,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s32( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_s32ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -338,6 +367,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s32_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_s32_1ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -349,6 +379,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s64( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_s64ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -360,6 +391,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_s64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_s64_1ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -371,6 +403,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u8( // CHECK-CXX-LABEL: @_Z25test_svwrite_hor_za128_u8ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -381,6 +414,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u8_1( // CHECK-CXX-LABEL: 
@_Z27test_svwrite_hor_za128_u8_1ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -391,6 +425,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u16( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_u16ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -402,6 +437,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_u16_1ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -413,6 +449,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u32( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_u32ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -424,6 +461,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u32_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_u32_1ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -435,6 +473,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u64( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_u64ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -446,6 +485,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_u64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_u64_1ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -457,6 +497,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f16( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_f16ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -468,6 +509,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_f16_1ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -479,6 +521,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16( // CHECK-CXX-LABEL: @_Z27test_svwrite_hor_za128_bf16ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -490,6 +533,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16_1( // CHECK-CXX-LABEL: @_Z29test_svwrite_hor_za128_bf16_1ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -501,6 +545,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f32( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_f32ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -512,6 +557,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f32_1( // CHECK-CXX-LABEL: 
@_Z28test_svwrite_hor_za128_f32_1ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -523,6 +569,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f64( // CHECK-CXX-LABEL: @_Z26test_svwrite_hor_za128_f64ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -534,6 +581,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_hor_za128_f64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_hor_za128_f64_1ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -545,6 +593,7 @@ SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za8_s8( // CHECK-CXX-LABEL: @_Z23test_svwrite_ver_za8_s8ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -555,6 +604,7 @@ SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za8_s8_1( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za8_s8_1ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -566,6 +616,7 @@ SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 15, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_s16( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za16_s16ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -577,6 +628,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_s16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za16_s16_1ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -589,6 +641,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_s32( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za32_s32ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -600,6 +653,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_s32_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za32_s32_1ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -612,6 +666,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice_base, 3, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_s64( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za64_s64ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -623,6 +678,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_s64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za64_s64_1ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -635,6 +691,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za8_u8( // CHECK-CXX-LABEL: @_Z23test_svwrite_ver_za8_u8ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -645,6 +702,7 @@ SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za8_u8_1( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za8_u8_1ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -656,6 +714,7 @@ 
SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 15, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_u16( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za16_u16ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -667,6 +726,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_u16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za16_u16_1ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -679,6 +739,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_u32( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za32_u32ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -690,6 +751,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_u32_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za32_u32_1ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -702,6 +764,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice_base, 3, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_u64( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za64_u64ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -713,6 +776,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_u64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za64_u64_1ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -725,6 +789,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_f16( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za16_f16ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -736,6 +801,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_f16_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za16_f16_1ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -748,6 +814,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za16_bf16ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -759,6 +826,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za16_bf16_1ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -771,6 +839,7 @@ SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice_base, 7, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_f32( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za32_f32ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -782,6 +851,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za32_f32_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za32_f32_1ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -794,6 +864,7 @@ SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice_base, 3, pg, zn); } 
+__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_f64( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za64_f64ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -805,6 +876,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za64_f64_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za64_f64_1ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -817,6 +889,7 @@ SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice_base, 1, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s8( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za128_s8ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -827,6 +900,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s8_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za128_s8_1ju10__SVBool_tu10__SVInt8_t( // CHECK-NEXT: entry: @@ -837,6 +911,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s16( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_s16ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -848,6 +923,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_s16_1ju10__SVBool_tu11__SVInt16_t( // CHECK-NEXT: entry: @@ -859,6 +935,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s32( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_s32ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -870,6 +947,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s32_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_s32_1ju10__SVBool_tu11__SVInt32_t( // CHECK-NEXT: entry: @@ -881,6 +959,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s64( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_s64ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -892,6 +971,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_s64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_s64_1ju10__SVBool_tu11__SVInt64_t( // CHECK-NEXT: entry: @@ -903,6 +983,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u8( // CHECK-CXX-LABEL: @_Z25test_svwrite_ver_za128_u8ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -913,6 +994,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u8_1( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za128_u8_1ju10__SVBool_tu11__SVUint8_t( // CHECK-NEXT: entry: @@ -923,6 +1005,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: 
@test_svwrite_ver_za128_u16( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_u16ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -934,6 +1017,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_u16_1ju10__SVBool_tu12__SVUint16_t( // CHECK-NEXT: entry: @@ -945,6 +1029,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u32( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_u32ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -956,6 +1041,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u32_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_u32_1ju10__SVBool_tu12__SVUint32_t( // CHECK-NEXT: entry: @@ -967,6 +1053,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u64( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_u64ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -978,6 +1065,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_u64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_u64_1ju10__SVBool_tu12__SVUint64_t( // CHECK-NEXT: entry: @@ -989,6 +1077,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_f16( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_f16ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -1000,6 +1089,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_f16_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_f16_1ju10__SVBool_tu13__SVFloat16_t( // CHECK-NEXT: entry: @@ -1011,6 +1101,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_bf16( // CHECK-CXX-LABEL: @_Z27test_svwrite_ver_za128_bf16ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -1022,6 +1113,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_bf16_1( // CHECK-CXX-LABEL: @_Z29test_svwrite_ver_za128_bf16_1ju10__SVBool_tu14__SVBFloat16_t( // CHECK-NEXT: entry: @@ -1033,6 +1125,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_f32( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_f32ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -1044,6 +1137,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_f32_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_f32_1ju10__SVBool_tu13__SVFloat32_t( // CHECK-NEXT: entry: @@ -1055,6 +1149,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: 
@test_svwrite_ver_za128_f64( // CHECK-CXX-LABEL: @_Z26test_svwrite_ver_za128_f64ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: @@ -1066,6 +1161,7 @@ SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, 0, pg, zn); } +__attribute__((arm_streaming, arm_shared_za)) // CHECK-C-LABEL: @test_svwrite_ver_za128_f64_1( // CHECK-CXX-LABEL: @_Z28test_svwrite_ver_za128_f64_1ju10__SVBool_tu13__SVFloat64_t( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_writes.c @@ -0,0 +1,965 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// == HORIZONTAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_hor_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_m( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + 
SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_bf16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_s64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_u64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_hor_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za128_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_hor_za128,_f64,_m,)(15, slice_base, 0, pg, zn); +} + +// == VERTICAL == + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z25test_svwrite_ver_za8_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svwrite_ver_za8_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_m,)(0, slice_base, 15, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za16_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za16_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_m,)(1, slice_base, 7, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za32_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_m,)(3, slice_base, 3, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// 
CHECK-LABEL: @test_svwrite_ver_za64_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za64_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_m,)(7, slice_base, 1, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_s8_mju10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s8_m(uint32_t slice_base, svbool_t pg, svint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za128_u8_mju10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u8_m(uint32_t slice_base, svbool_t pg, svuint8_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u8,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s16_mju10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s16_m(uint32_t slice_base, svbool_t pg, svint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail 
call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u16_mju10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u16_m(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f16_mju10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f16_m(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za128_bf16_mju10__SVBool_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_bf16_m(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_bf16,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s32_mju10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s32_m(uint32_t slice_base, svbool_t pg, svint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u32_m( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u32_mju10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u32_m(uint32_t slice_base, svbool_t pg, svuint32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f32_mju10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f32_m(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f32,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_s64_mju10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_s64_m(uint32_t slice_base, svbool_t pg, svint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_s64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_u64_mju10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_u64_m(uint32_t slice_base, svbool_t pg, svuint64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_u64,_m,)(15, slice_base, 0, pg, zn); +} + +// +// 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za128_f64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svwrite_ver_za128_f64_mju10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za128_f64_m(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { + SVE_ACLE_FUNC(svwrite_ver_za128,_f64,_m,)(15, slice_base, 0, pg, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,46 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -#include +#include -// CHECK-C-LABEL: @test_svzero_mask_za( -// CHECK-CXX-LABEL: @_Z19test_svzero_mask_zav( +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svzero_mask_za_0( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 0) // CHECK-NEXT: ret void // -void test_svzero_mask_za() { - svzero_mask_za(0); -} - -// CHECK-C-LABEL: @test_svzero_mask_za_1( -// CHECK-CXX-LABEL: @_Z21test_svzero_mask_za_1v( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 176) -// CHECK-NEXT: ret void +// CPP-CHECK-LABEL: @_Z21test_svzero_mask_za_0v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 0) +// CPP-CHECK-NEXT: ret void // -void test_svzero_mask_za_1() { - svzero_mask_za(176); +void test_svzero_mask_za_0(void) { + svzero_mask_za(0); } -// CHECK-C-LABEL: @test_svzero_mask_za_2( -// CHECK-CXX-LABEL: @_Z21test_svzero_mask_za_2v( +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svzero_mask_za_255( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_mask_za_2() { +// CPP-CHECK-LABEL: @_Z23test_svzero_mask_za_255v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CPP-CHECK-NEXT: ret void 
+// +void test_svzero_mask_za_255(void) { svzero_mask_za(255); } -// CHECK-C-LABEL: @test_svzero_za( -// CHECK-CXX-LABEL: @_Z14test_svzero_zav( +__attribute__((arm_streaming_compatible, arm_shared_za)) +// CHECK-LABEL: @test_svzero_za( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_za() { +// CPP-CHECK-LABEL: @_Z14test_svzero_zav( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CPP-CHECK-NEXT: ret void +// +void test_svzero_za(void) { svzero_za(); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_add.c @@ -0,0 +1,54 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme-i16i64 -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_s32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddha_za32_u32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_s64_m(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddha_za64_u64_m(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_s32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svaddva_za32_u32_m(4, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_s64_m(8, pn, pm, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svaddva_za64_u64_m(8, pn, pm, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_loads.c @@ -0,0 +1,70 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, const 
int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svld1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_ver_za128(16, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svld1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svld1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svld1_hor_za32(0, slice_base, 4, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svld1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svld1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, const int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 0]}} + svld1_ver_za128(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) +void test_svldr_vnum_za(uint32_t slice_base, const uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svldr_vnum_za(slice_base, 16, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_mop.c @@ -0,0 +1,129 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 \ +// RUN: -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s + +#include + +// == MOPS / MOPA == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + 
svmopa_za32_bf16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_u16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmopa_za32_f32_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmopa_za64_f64_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_bf16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_f16_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svmops_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_u16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range 
[0, 3]}} + svmops_za32_f32_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svmops_za64_f64_m(8, pn, pm, zn, zm); +} + +// == MIXED SIGN MOPA / MOPS == + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svsumops_za32_s8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svsumops_za64_s16_m(8, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svusmops_za32_u8_m(4, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svusmops_za64_u16_m(8, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_reads.c @@ -0,0 +1,64 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_tile(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za8_s8_m(zd, pg, 1, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_tile(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_ver_za16_u16_m(zd, pg, 2, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_tile(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 4, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za64_bad_tile(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_hor_za64_s64_m(zd, pg, 8, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_tile(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za128_bf16_m(zd, pg, 16, slice_base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za8_bad_slice(svint8_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svread_hor_za8_s8_m(zd, pg, 0, slice_base, 16); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za16_bad_slice(svuint16_t zd, svbool_t pg, uint32_t slice_base) { + // 
expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svread_ver_za16_u16_m(zd, pg, 1, slice_base, 8); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_ver_za32_bad_slice(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_f32_m(zd, pg, 3, slice_base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za64_bad_slice(svint64_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svread_hor_za64_s64_m(zd, pg, 7, slice_base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svread_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za128_s32_m(zd, pg, 15, slice_base, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_stores.c @@ -0,0 +1,70 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_hor_za8(1, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za16_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_hor_za16(2, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(4, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za64_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za64(8, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_bad_tile(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_ver_za128(16, slice_base, 0, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za8_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svst1_hor_za8(0, slice_base, 16, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za16_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svst1_ver_za16(0, slice_base, 8, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_hor_za32_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svst1_hor_za32(0, slice_base, 4, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void 
test_svst1_ver_za64_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svst1_ver_za64(0, slice_base, 2, pg, base); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svst1_ver_za128_slice_offset(uint32_t slice_base, svbool_t pg, int8_t *base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svst1_ver_za128(0, slice_base, 1, pg, base); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) +void test_svstr_vnum_za(uint32_t slice_base, uint8_t *base) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svstr_vnum_za(slice_base, 16, base); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_writes.c @@ -0,0 +1,64 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_tile(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za8_s8_m(1, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_tile(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_m(2, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_tile(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(4, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_tile(uint32_t slice_base, svbool_t pg, svint64_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_hor_za64_s64_m(8, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_tile(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svwrite_hor_za128_bf16_m(16, slice_base, 0, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za8_bad_slice(uint32_t slice_base, svbool_t pg, svint8_t zn) { + // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} + svwrite_hor_za8_s8_m(0, slice_base, 16, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za16_bad_slice(uint32_t slice_base, svbool_t pg, svuint16_t zn) { + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za16_u16_m(1, slice_base, 8, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_ver_za32_bad_slice(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { + // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_f32_m(3, slice_base, 4, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za64_bad_slice(uint32_t slice_base, svbool_t pg, svint64_t zn) { + // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} + 
svwrite_hor_za64_s64_m(7, slice_base, 2, pg, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svwrite_hor_za128_bad_slice(svint32_t zd, svbool_t pg, uint32_t slice_base) { + // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za128_s32_m(15, slice_base, 1, pg, zd); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/negative/acle_sme_zero.c @@ -0,0 +1,10 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +#include + +__attribute__((arm_streaming, arm_shared_za)) +void test_svzero_mask_za(void) { + // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} + svzero_mask_za(256); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c @@ -0,0 +1,1270 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
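+// For example (illustrative expansion of the macro defined below):
+//   SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)
+// resolves to the overloaded name svadd_write_za32_vg1x2 (A1##A3##A5) when
+// SVE_OVERLOADED_FORMS is defined, and to the fully-suffixed name
+// svadd_write_single_za32_s32_vg1x2 (A1##A2##A3##A4##A5) otherwise.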
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single-Multi +// + +// x2 +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s32j11svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u32j12svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_s64j11svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 
[[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s32j11svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_s64j11svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svadd_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s32j11svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u32j12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_s64j11svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi2_u64j12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s32j11svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u32j12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_s64j11svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svadd_write_multi4_u64j12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) { + SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, 7, zn, 
zm); +} + +// +// Multi-Single Vector +// + +// x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,,)(zn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,,)(zn, zm); +} + + +// x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( 
[[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], 
[[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x4,,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x4,,,)(zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z25test_svadd_za32_vg1x2_s32j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_s64j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, 6, zn); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_s32j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) { + SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// 
+// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_s64j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) { + SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svadd_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svadd_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZN]], i64 6)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[TMP4]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) {
+  SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, 6, zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -0,0 +1,769 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// SCLAMP_X2
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svclamp_single_s8_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP1]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x210svint8x2_tu10__SVInt8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP1]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP6]]
+//
+svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) {
+  return SVE_ACLE_FUNC(svclamp, _single_s8_x2, , )(op1, op2, op3);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svclamp_single_s16_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x211svint16x2_tu11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16>
[[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x211svint32x2_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s32_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x211svint64x2_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s64_x2, , )(op1, op2, op3); +} + + +// SCLAMP_X4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x410svint8x4_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svclamp_single_s8_x4(svint8x4_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s8_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x411svint16x4_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svclamp_single_s16_x4(svint16x4_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x411svint32x4_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svclamp_single_s32_x4(svint32x4_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret 
[[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x411svint64x4_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svclamp_single_s64_x4(svint64x4_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_s64_x4, , )(op1, op2, op3); +} + + + + +// UCLAMP_X2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x211svuint8x2_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svclamp_single_u8_x2(svuint8x2_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u8_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u16_x2( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x212svuint16x2_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svclamp_single_u16_x2(svuint16x2_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u32_x212svuint32x2_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svclamp_single_u32_x2(svuint32x2_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u32_x2, , )(op1, op2, op3); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x212svuint64x2_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.uclamp.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svclamp_single_u64_x2(svuint64x2_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u64_x2, , )(op1, op2, op3); +} + + +// UCLAMP_X4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svclamp_single_u8_x411svuint8x4_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 16) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[OP1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svclamp_single_u8_x4(svuint8x4_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u8_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u16_x412svuint16x4_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svclamp_single_u16_x4(svuint16x4_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u32_x412svuint32x4_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// 
+svuint32x4_t test_svclamp_single_u32_x4(svuint32x4_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_u64_x412svuint64x4_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.uclamp.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svclamp_single_u64_x4(svuint64x4_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_u64_x4, , )(op1, op2, op3); +} + + + + +// FCLAMP_X2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } 
@llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x213svfloat16x2_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svclamp_single_f16_x2(svfloat16x2_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f16_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x213svfloat32x2_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svclamp_single_f32_x2(svfloat32x2_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f32_x2, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) + +// CHECK-LABEL: @test_svclamp_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x213svfloat64x2_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fclamp.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svclamp_single_f64_x2(svfloat64x2_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f64_x2, , )(op1, op2, op3); +} + + +// FCLAMP_X4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f16_x413svfloat16x4_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[OP1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail 
call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svclamp_single_f16_x4(svfloat16x4_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f16_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f32_x413svfloat32x4_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[OP1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } 
[[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f32_x4, , )(op1, op2, op3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svclamp_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svclamp_single_f64_x413svfloat64x4_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[OP1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fclamp.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3); +} diff --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create2_bool.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svcreate2_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[X0:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svcreate2_b8u10__SVBool_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[X0:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svboolx2_t test_svcreate2_b8(svbool_t x0, svbool_t x1) +{ + return SVE_ACLE_FUNC(svcreate2,_b8,,)(x0, x1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_create4_bool.c @@ -0,0 +1,37 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svcreate4_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( poison, [[X0:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP1]], [[X2:%.*]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP2]], [[X4:%.*]], i64 48) +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z17test_svcreate4_b8u10__SVBool_tu10__SVBool_tu10__SVBool_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( poison, [[X0:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP1]], [[X2:%.*]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP2]], [[X4:%.*]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svboolx4_t test_svcreate4_b8(svbool_t x0, svbool_t x1, svbool_t x2, svbool_t x4) +{ + return SVE_ACLE_FUNC(svcreate4,_b8,,)(x0, x1, x2, x4); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -0,0 +1,518 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvt_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvt.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z15test_cvt_f16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvt.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_cvt_f16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvt_f16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvt_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvt.x2( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z16test_cvt_bf16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvt.x2( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvt_bf16,_f32_x2,,)(zn); +} + + +// x2 +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t 
test_svcvt_f32_u32_x2(svuint32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_u32_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_u32,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_s32_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn){ + return SVE_ACLE_FUNC(svcvt_s32,_f32_x2,,)(zn); +} + +// x4 +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_f32_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_f32_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t 
test_svcvt_f32_s32_x4(svint32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_f32,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_u32_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_u32_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_u32,_f32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcvt_s32_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z21test_svcvt_s32_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svcvt_s32_f32_x4(svfloat32x4_t zn){ + return SVE_ACLE_FUNC(svcvt_s32,_f32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_s16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_qcvt_s16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_s16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret 
[[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvt_u16_u32_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvt_u16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u8_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_u8_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvt_u8_u32_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u8,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: 
@_Z20test_qcvt_u16_u64_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvt_u16_u64_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_u64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s8_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_s8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_qcvt_s8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_s8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_s16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_s16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvt.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_qcvt_s16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_s16,_s64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u8_s32_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvt_u8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvt_u16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvt_u16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtu.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c @@ -0,0 +1,253 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS 
-triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvtn_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvtn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z16test_cvtn_f16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fcvtn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat16_t test_cvtn_f16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvtn_f16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_cvtn_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvtn.x2( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_cvtn_bf16_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.bfcvtn.x2( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_cvtn_bf16_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svcvtn_bf16,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_s16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_s16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// 
CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_u16_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_u32_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_u32_x2,,)(zn); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_qcvtn_u16_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_s32_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x2.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_s32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u8_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvtn_u8_u32_x412svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvtn_u8_u32_x4(svuint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u8,_u32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u16_u64_x4( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_u64_x412svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.uqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvtn_u16_u64_x4(svuint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_u64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_s8_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_qcvtn_s8_s32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8_t test_qcvtn_s8_s32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_s16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_s16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16_t test_qcvtn_s16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_s16,_s64_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u8_32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_qcvtn_u8_32_x411svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8_t test_qcvtn_u8_32_x4(svint32x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u8,_s32_x4,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_qcvtn_u16_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z21test_qcvtn_u16_s64_x411svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.sqcvtun.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16_t test_qcvtn_u16_s64_x4(svint64x4_t zn) { + return SVE_ACLE_FUNC(svqcvtn_u16,_s64_x4,,)(zn); +} + + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c @@ -0,0 +1,339 @@ +// NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (half) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_f16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z30test_svdot_lane_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_f16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +// +// Multi, multi (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// 
CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (bfloat) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_bf16,_vg1x4)(slice_base, 7, zn, zm, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// FRINTA + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrinta_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frinta.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrinta,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrinta_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frinta.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrinta,_f32_x4,,)(zn); +} + +// FRINTM + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintam_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svfrintam_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintm.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintm,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintm_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } 
[[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintm_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrintm,_f32_x4,,)(zn); +} + +// FRINTN + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintn_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintn.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// 
+svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintn,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintn_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) { + return SVE_ACLE_FUNC(svrintn,_f32_x4,,)(zn); +} + +// FRINTP + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintp_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } 
[[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.frintp.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) { + return SVE_ACLE_FUNC(svrintp,_f32_x2,,)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svfrintp_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.frintp.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+svfloat32x4_t test_svfrintp_f32_x4(svfloat32x4_t zn) {
+  return SVE_ACLE_FUNC(svrintp,_f32_x4,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get2-bool.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svget2_b8_0(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[TUPLE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svget2_b8_010svboolx2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[TUPLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+svbool_t test_svget2_b8_0(svboolx2_t tuple)
+{
+  return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 0);
+}
+
+// CHECK-LABEL: @test_svget2_b8_1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[TUPLE:%.*]], i64 16)
+// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svget2_b8_110svboolx2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[TUPLE:%.*]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+svbool_t test_svget2_b8_1(svboolx2_t tuple)
+{
+  return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 1);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_get4-bool.c
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// NOTE: For these tests clang converts the struct parameter into
+// several parameters, one for each member of the original struct.
+// CHECK-LABEL: @test_svget4_b8_0(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 0)
+// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svget4_b8_010svboolx4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+svbool_t test_svget4_b8_0(svboolx4_t tuple)
+{
+  return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 0);
+}
+
+// NOTE: For these tests clang converts the struct parameter into
+// several parameters, one for each member of the original struct.
+// CHECK-LABEL: @test_svget4_b8_1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 16)
+// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svget4_b8_110svboolx4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+svbool_t test_svget4_b8_1(svboolx4_t tuple)
+{
+  return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 1);
+}
+
+// NOTE: For these tests clang converts the struct parameter into
+// several parameters, one for each member of the original struct.
+// CHECK-LABEL: @test_svget4_b8_3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 48)
+// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z16test_svget4_b8_310svboolx4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
+//
+svbool_t test_svget4_b8_3(svboolx4_t tuple)
+{
+  return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 3);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -0,0 +1,1205 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret 
void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, multi (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) 
{ + SVE_ACLE_FUNC(svdot,,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) { + SVE_ACLE_FUNC(svdot,,_za64,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot,_single,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + 
SVE_ACLE_FUNC(svdot,_single,_za64,_s16,_vg1x4)(slice_base, 7, zn, zm); +} + + + + + + +// +// Multi, indexed (unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], 
[[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void 
test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_u16,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_u16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + + +// +// Multi, indexed (signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) 
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s16,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( 
[[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_s16,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svdot_lane,,_za64,_s16,_vg1x4)(slice_base, 7, zn, 
zm, 1); +} + + +// +// Multi, multi (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x2_u8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) { + SVE_ACLE_FUNC(svusdot,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x4_u8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 
16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) { + SVE_ACLE_FUNC(svusdot,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, single (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot,_single,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) { + 
SVE_ACLE_FUNC(svusdot,_single,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (unsigned by signed) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot_lane,,_za32,_u8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svusdot_lane,,_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +// +// Multi, single (signed by unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot,_single,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot,_single,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm); +} + + +// +// Multi, indexed (signed by unsigned) +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot_lane,,_za32,_s8,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svsudot_lane,,_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c @@ -0,0 +1,52 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
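+// Illustrative note (added commentary, not part of the checked output): with
+// SVE_OVERLOADED_FORMS defined, a call such as SVE_ACLE_FUNC(svfoo,_bar)(args)
+// resolves to the overloaded name svfoo(args); otherwise it resolves to the
+// fully-suffixed svfoo_bar(args). The names svfoo/_bar are placeholders here.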
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// LDR ZT0 + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +// CHECK-LABEL: @test_svldr_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z13test_svldr_ztPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svldr_zt(const void *base) { + svldr_zt(0, base); +} + +// STR ZT0 + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +// CHECK-LABEL: @test_svstr_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z13test_svstr_ztPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstr_zt(void *base) { + svstr_zt(0, base); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c @@ -0,0 +1,58 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return svluti2_lane_zt_u8(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return svluti2_lane_zt_u16(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] 
+// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return svluti2_lane_zt_u32(0, zn, 2); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -0,0 +1,91 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
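+// Illustrative expansion (comment only, not checked output): the call
+// SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x2,)(0, zn, 0) used below becomes
+// svluti2_lane_zt_x2(0, zn, 0) when SVE_OVERLOADED_FORMS is defined, and
+// svluti2_lane_zt_u8_x2(0, zn, 0) otherwise.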
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x2,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, [[ZN:%.*]], i32 7) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u16,_x2,)(0, zn, 7); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
{ , } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, [[ZN:%.*]], i32 7) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u32,_x2,)(0, zn, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -0,0 +1,117 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
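+// Note (added commentary): each _x4 form tested below returns a four-vector
+// tuple; the generated IR assembles it from the four scalable-vector results
+// of the intrinsic via llvm.vector.insert, as the CHECK lines show.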
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u8,_x4,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti2_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, [[ZN:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svluti2_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u16,_x4,)(0, zn, 3); +} + + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + + +// CHECK-LABEL: @test_svluti2_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti2_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, [[ZN:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svluti2_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti2_lane_zt,_u32,_x4,)(0, zn, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -0,0 +1,57 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti4_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) { + return svluti4_lane_zt_u8(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return svluti4_lane_zt_u16(0, zn, 2); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: @test_svluti4_lane_zt_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return svluti4_lane_zt_u32(0, zn, 2); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u8 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z23test_svluti4_lane_zt_u8u11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u8,_x2,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, [[ZN]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u16,_x2,)(0, zn, 3); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], 
i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, [[ZN]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u32,_x2,)(0, zn, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -0,0 +1,86 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u16u12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svluti4_lane_zt_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u16,_x4,)(0, zn, 0); +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u32u12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, [[ZN]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svluti4_lane_zt_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svluti4_lane_zt,_u32,_x4,)(0, zn, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c @@ -0,0 +1,1607 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svmax_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svmax_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return 
SVE_ACLE_FUNC(svmax,_single_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svmax_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svmax_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svmax_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svmax_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svmax_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svmax_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umax.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svmax_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmax_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svmax_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmax_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f64_x2)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svmax_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svmax_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svmax_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svmax_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] 
= tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmax_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svmax_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svmax_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svmax_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svmax_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f16_x4( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmax_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmax_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] 
+// +// CPP-CHECK-LABEL: @_Z24test_svmax_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmax_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmax,_single_f64_x4)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// 
+svint8x2_t test_svmax_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svmax_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svmax_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svmax_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svmax_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svmax_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u16_x2)(zdn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svmax_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( 
[[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umax.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svmax_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmax_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmax_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmax.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmax_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmax,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svmax_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + 
return SVE_ACLE_FUNC(svmax,_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail 
call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svmax_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svmax_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svmax_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmax_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svmax_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svmax_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svmax_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svmax_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmax_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmax_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmax_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_f64_x413svfloat64x4_t13svfloat64x4_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmax_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmax,_f64_x4)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c @@ -0,0 +1,456 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
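+// Illustrative expansion of the macro defined below (not part of the
+// autogenerated checks): with SVE_OVERLOADED_FORMS defined,
+//   SVE_ACLE_FUNC(svmaxnm,_single_f16_x2,,,)(zdn, zm)
+// keeps only the first, third and fifth tokens and so becomes the overloaded
+// call svmaxnm(zdn, zm); without it, all five tokens are pasted together,
+// giving the fully-suffixed name svmaxnm_single_f16_x2(zdn, zm).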
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmaxnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t 
test_svmaxnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmaxnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmaxnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmaxnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svmaxnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmaxnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t 
zm) { + return SVE_ACLE_FUNC(svmaxnm,_single_f64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmaxnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmaxnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmaxnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmaxnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmaxnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmaxnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmaxnm_multi_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svmaxnm_multi_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmaxnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmaxnm_multi_f64_x4(svfloat64x4_t zdn, 
svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmaxnm,_f64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c @@ -0,0 +1,1607 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svmin_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svmin_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svmin_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.smin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svmin_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svmin_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svmin_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svmin_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], 
[[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.umin.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svmin_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svmin_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svmin_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svmin_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f64_x2)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.smin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svmin_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue 
{ , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svmin_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svmin_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: 
[[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svmin_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z23test_svmin_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svmin_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svmin_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svmin_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svmin_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svmin_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 
12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svmin_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svmin_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svmin_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svmin,_single_f64_x4)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svmin_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP8]]
+//
+svint16x2_t test_svmin_s16_x2(svint16x2_t zdn, svint16x2_t zm) {
+  return SVE_ACLE_FUNC(svmin,_s16_x2)(zdn, zm);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svmin_s32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZDN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZDN]], i64 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.smin.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_s32_x211svint32x2_t11svint32x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZDN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZDN]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZM]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.smin.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
+//
+svint32x2_t test_svmin_s32_x2(svint32x2_t zdn, svint32x2_t zm) {
+  return SVE_ACLE_FUNC(svmin,_s32_x2)(zdn, zm);
+}
+
+__attribute__((arm_streaming))
+// CHECK-LABEL: @test_svmin_s64_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZDN:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZDN]], i64 2)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZM]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.smin.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_s64_x211svint64x2_t11svint64x2_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZDN:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.smin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svmin_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_s64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svmin_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u8_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svmin_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t 
test_svmin_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.umin.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svmin_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_u64_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svmin_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f16_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svmin_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f32_x2)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } 
[[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fmin.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svmin_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svmin,_f64_x2)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svmin_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svmin_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) 
+// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svmin_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: 
[[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svmin_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_s64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svmin_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svmin_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u8_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svmin_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: 
[[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svmin_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// 
CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svmin_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_u64_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svmin_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f16_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svmin_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f32_x4)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svmin_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svmin_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svmin,_f64_x4)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c new file mode 100644 --- /dev/null 
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c @@ -0,0 +1,456 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x213svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv8f16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svminnm_single_f16_x2(svfloat16x2_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] 
= tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x213svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv4f32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svminnm_single_f32_x2(svfloat32x2_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x213svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.single.x2.nxv2f64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svminnm_single_f64_x2(svfloat64x2_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f16_x413svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat16x4_t test_svminnm_single_f16_x4(svfloat16x4_t zdn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( 
[[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f32_x413svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat32x4_t test_svminnm_single_f32_x4(svfloat32x4_t zdn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_single_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z26test_svminnm_single_f64_x413svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) 
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.single.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svfloat64x4_t test_svminnm_single_f64_x4(svfloat64x4_t zdn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svminnm,_single_f64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x213svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svminnm_multi_f16_x2(svfloat16x2_t zdn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svminnm_multi_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x213svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svminnm_multi_f32_x2(svfloat32x2_t zdn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x213svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.fminnm.x2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svminnm_multi_f64_x2(svfloat64x2_t zdn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svminnm,_f64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f16_x413svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svminnm_multi_f16_x4(svfloat16x4_t zdn, svfloat16x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f32_x413svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svminnm_multi_f32_x4(svfloat32x4_t zdn, svfloat32x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svminnm_multi_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z25test_svminnm_multi_f64_x413svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.fminnm.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svminnm_multi_f64_x4(svfloat64x4_t zdn, svfloat64x4_t zm) { + return SVE_ACLE_FUNC(svminnm,_f64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c @@ -0,0 +1,334 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
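+// (Illustration only, not part of the autogenerated checks: with
+// SVE_OVERLOADED_FORMS defined, a call such as
+//   SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm);
+// resolves to the overloaded name
+//   svmla_za32_vg1x2(slice_base, 7, zn, zm);
+// and without SVE_OVERLOADED_FORMS it resolves to the fully-suffixed name
+//   svmla_za32_f32_vg1x2(slice_base, 7, zn, zm);
+// so both macro variants below exercise the same builtin.)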
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// 
+// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[TMP4]], 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + SVE_ACLE_FUNC(svmla,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmla_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c @@ -0,0 +1,794 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( 
[[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmla4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( 
[[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: 
[[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm); +} + +// +// Multi, indexed +// + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + 
SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t 
zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane,,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c new file mode 
100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c @@ -0,0 +1,1945 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include 
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Single x 1
+//
+
+// MLAL
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmla_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm)
+{
+ SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_svmla_single_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm)
+{
+ SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x1)(slice_base, 12, zn, zm);
+}
+
+__attribute__((arm_streaming, arm_shared_za))
+// CHECK-LABEL: @test_uvmlal_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], 
[[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlal_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x1)(slice_base, 12, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlsl_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x1)(slice_base, 12, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_uvmlsl_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x1)(slice_base, 12, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_s8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x1_s8(uint32_t slice_base, svuint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x1)(slice_base, 12, zn, zm); +} + +// +// Single x 2 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base, 4, zn, zm); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumla_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_u8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x2_u8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla,_single,_za32,_u8,_vg4x2)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_s8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) +{ + 
SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x2)(slice_base, 4, zn, zm); +} + +// +// Single x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmla_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base, 4, zn, zm); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsumla_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_u8j10svint8x4_tu11__SVUint8_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x4_u8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla,_single,_za32,_u8,_vg4x4)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_s8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla,_single,_za32,_s8,_vg4x4)(slice_base, 4, zn, zm); +} + +// +// Multi x 2 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) 
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_mlsl_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( 
[[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_s8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) +{ + SVE_ACLE_FUNC(svusmla_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm); +} + +// +// Multi x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( 
[[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlal_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm); +} + 
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_mlsl_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], 
[[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_s8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) +{ + SVE_ACLE_FUNC(svusmla_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm); +} + +// +// Indexed x 1 +// + +// SMLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t 
zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +// SMLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, 12, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_s8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x1_s8(uint32_t slice_base, svuint8_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, 12, zn, zm, 15); +} + +// +// Indexed x 2 +// + +// SMLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +// SMLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// 
CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, 4, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_s8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x2_s8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, 4, zn, zm, 15); +} + +// +// Indexed x 4 +// + +// MLAL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 
[[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlal_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +// MLSL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_smlsl_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, 4, zn, zm, 7); +} + +// SUMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_sumlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} + +// USMLALL + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_usmlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_s8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, 4, zn, zm, 15); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c @@ -0,0 +1,334 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f32j13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f32j13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) { + SVE_ACLE_FUNC(svmls,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za32,_f32,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f64j13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) { + SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f64j13svfloat64x4_t13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) { + 
SVE_ACLE_FUNC(svmls,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi, indexed +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x2)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) { + SVE_ACLE_FUNC(svmls_lane,,_za64,_f64,_vg1x4)(slice_base, 7, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c @@ -0,0 +1,794 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK 
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) +{ + 
SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmls4_bf16j14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm); +} + 
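// The SVE_ACLE_FUNC macro defined at the top of this test is plain token
// pasting, so each call site exercises two spellings of the same intrinsic
// depending on whether -DSVE_OVERLOADED_FORMS is passed on the RUN line.
// A minimal sketch of the expansion, using the test_svmls4_u16 call above
// as the example (the expanded names below follow directly from the macro):
//
//   SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, 6, zn, zm);
//   // without SVE_OVERLOADED_FORMS: svmls_za32_u16_vg2x4(slice_base, 6, zn, zm);
//   // with SVE_OVERLOADED_FORMS:    svmls_za32_vg2x4(slice_base, 6, zn, zm);
//
// Both C RUN configurations share the same CHECK prefix, so either spelling is
// expected to lower to the @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16 call checked
// above, with the constant slice offset folded as "add i32 [[SLICE_BASE]], 6".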
+__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, 6, zn, zm); +} + +// +// Multi, single +// +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP0]], 
[[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_svmls_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm); +} + +// +// Multi, indexed +// + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SLICE_BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[TMP0]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x1)(slice_base, 14, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x2)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_f16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_bf16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_u16,_vg2x4)(slice_base, 6, zn, zm, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmls_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, 
svint16_t zm) +{ + SVE_ACLE_FUNC(svmls_lane,,_za32,_s16,_vg2x4)(slice_base, 6, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// MOPA + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmopa_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_s16,_m,)(3, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmopa_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmopa_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmopa_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmopa_za32,_u16,_m,)(3, pn, pm, zn, zm); +} + +// MOPS + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmops_s16u10__SVBool_tu10__SVBool_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svmops_za32,_s16,_m,)(3, pn, pm, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svmops_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmops_u16u10__SVBool_tu10__SVBool_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmops_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svmops_za32,_u16,_m,)(3, pn, pm, zn, zm); +} + +// BMOPA + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svbmopa_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svbmopa_u16u10__SVBool_tu10__SVBool_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svbmopa_u16(svbool_t pn, svbool_t pm, svuint32_t zn, 
svuint32_t zm) { + SVE_ACLE_FUNC(svbmopa_za32,_u32,_m,)(3, pn, pm, zn, zm); +} + +// BMOPS + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svbmops_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svbmops_u16u10__SVBool_tu10__SVBool_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PM:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svbmops_u16(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svbmops_za32,_u32,_m,)(3, pn, pm, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c @@ -0,0 +1,1547 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) { + return svread_ver_za8_u8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) { + return svread_ver_za8_s8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// 
CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) { + return svread_hor_za8_u8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) { + return svread_hor_za8_s8_vg2(0, base, 14); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, 
[[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) { + return svread_hor_za8_u8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) { + return svread_hor_za8_s8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) { + return svread_ver_za8_u8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) { + return svread_ver_za8_s8_vg4(0, base, 12); +} + +__attribute__((arm_streaming, arm_shared_za)) 
+// CHECK-LABEL: @test_svread_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) { + return svread_hor_za16_u16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) { + return svread_hor_za16_bf16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 
6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) { + return svread_hor_za16_f16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) { + return svread_hor_za16_s16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) { + return svread_ver_za16_u16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) { + return svread_ver_za16_bf16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) { + return svread_ver_za16_f16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) { + return svread_ver_za16_s16_vg2(1, base, 6); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) { + return svread_hor_za16_u16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } 
[[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_hor_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) { + return svread_hor_za16_bf16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) { + return svread_hor_za16_f16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) { + return svread_hor_za16_s16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_u16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = 
tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) { + return svread_ver_za16_u16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z29test_svread_ver_za16_bf16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) { + return svread_ver_za16_bf16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: 
[[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_f16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) { + return svread_ver_za16_f16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za16_s16_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) { + return svread_ver_za16_s16_vg4(1, base, 4); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg2( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) { + return svread_hor_za32_u32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) { + return svread_hor_za32_f32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) { + return svread_hor_za32_s32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) { + return svread_ver_za32_u32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) { + return svread_ver_za32_f32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) { + return svread_ver_za32_s32_vg2(3, base, 2); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) { + return svread_hor_za32_u32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) { + return svread_hor_za32_f32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 
12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) { + return svread_hor_za32_s32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_u32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) { + return svread_ver_za32_u32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_f32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) { + return svread_ver_za32_f32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za32_s32_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) { + return svread_ver_za32_s32_vg4(3, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) { + return svread_hor_za64_u64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) { + return svread_hor_za64_f64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) { + return svread_hor_za64_s64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], 
[[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) { + return svread_ver_za64_u64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) { + return svread_ver_za64_f64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) { + return svread_ver_za64_s64_vg2(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 
[[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) { + return svread_hor_za64_u64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) { + return svread_hor_za64_f64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_hor_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) { + return svread_hor_za64_s64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_u64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) { + return svread_ver_za64_u64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_f64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) { + return svread_ver_za64_f64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , 
, } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z28test_svread_ver_za64_s64_vg4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) { + return svread_ver_za64_s64_vg4(7, base, 0); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) { + return svread_za64_u64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) { + return svread_za64_f64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x2j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) { + return svread_za64_s64_vg1x2(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_u64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) { + return svread_za64_u64_vg1x4(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_f64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) { + return svread_za64_f64_vg1x4(base, 7); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svread_za64_s64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z26test_svread_za64_s64_vg1x4j( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BASE:%.*]], 7 +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) { + return svread_za64_s64_vg1x4(base, 7); +} + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svreinterpret_svbool_svcnt( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svbool_svcntu11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) +{ + return SVE_ACLE_FUNC(svreinterpret,_b,,)(cnt); +} + +// CHECK-LABEL: @test_svreinterpret_svcnt_svbool( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[PG:%.*]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svcnt_svboolu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[PG:%.*]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svreinterpret_svcnt_svbool(svbool_t pg) +{ + return SVE_ACLE_FUNC(svreinterpret,_c,,)(pg); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_rshl.c @@ -0,0 +1,1176 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svrshl_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svrshl_single_s16_x2(svint16x2_t zdn, 
svint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svrshl_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svrshl_single_s64_x2(svint64x2_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s64_x2,,,)(zdn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x211svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t test_svrshl_single_u8_x2(svuint8x2_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x212svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svrshl_single_u16_x2(svuint16x2_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svrshl_single_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svrshl_single_u32_x2(svuint32x2_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x212svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svrshl_single_u64_x2(svuint64x2_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s8_x4( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svrshl_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svrshl_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// 
CPP-CHECK-LABEL: @_Z25test_svrshl_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svrshl_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
[[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svrshl_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_s64_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_single_u8_x411svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: 
[[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svrshl_single_u8_x4(svuint8x4_t zdn, svuint8_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u16_x412svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svrshl_single_u16_x4(svuint16x4_t zdn, svuint16_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u32_x412svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svrshl_single_u32_x4(svuint32x4_t zdn, svuint32_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_single_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], 
i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z25test_svrshl_single_u64_x412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svrshl_single_u64_x4(svuint64x4_t zdn, svuint64_t zm) { + return SVE_ACLE_FUNC(svrshl,_single_u64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv16i8( [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svrshl_multi_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svrshl_multi_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svrshl_multi_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.srshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svrshl_multi_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_s64_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( 
[[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x211svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svrshl_multi_u8_x2(svuint8x2_t zdn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x212svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv8i16( [[TMP0]], [[TMP1]], 
[[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svrshl_multi_u16_x2(svuint16x2_t zdn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x212svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svrshl_multi_u32_x2(svuint32x2_t zdn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x212svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.urshl.x2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svrshl_multi_u64_x2(svuint64x2_t zdn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svrshl,_u64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svrshl_multi_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svrshl_multi_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svrshl_multi_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.srshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svrshl_multi_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_s64_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail 
call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z23test_svrshl_multi_u8_x411svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svrshl_multi_u8_x4(svuint8x4_t zdn, svuint8x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: 
[[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u16_x412svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svrshl_multi_u16_x4(svuint16x4_t zdn, svuint16x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue 
{ , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u32_x412svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svrshl_multi_u32_x4(svuint32x4_t zdn, svuint32x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svrshl_multi_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// 
CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z24test_svrshl_multi_u64_x412svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.urshl.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svrshl_multi_u64_x4(svuint64x4_t zdn, svuint64x4_t zm) { + return SVE_ACLE_FUNC(svrshl,_u64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set2-bool.c @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svset2_b8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svset2_b8_010svboolx2_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svboolx2_t test_svset2_b8_0(svboolx2_t tuple, svbool_t x) +{ + return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 0, x); +} + +// CHECK-LABEL: @test_svset2_b8_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svset2_b8_110svboolx2_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svboolx2_t test_svset2_b8_1(svboolx2_t tuple, svbool_t x) +{ + return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 1, x); +} + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_set4-bool.c @@ -0,0 +1,62 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + + +// CHECK-LABEL: @test_svset4_b8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svset4_b8_010svboolx4_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svboolx4_t test_svset4_b8_0(svboolx4_t tuple, svbool_t x) +{ + return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 0, x); +} + +// CHECK-LABEL: @test_svset4_b8_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svset4_b8_110svboolx4_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svboolx4_t test_svset4_b8_1(svboolx4_t tuple, svbool_t x) +{ + return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 1, x); +} + +// CHECK-LABEL: @test_svset4_b8_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 48) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svset4_b8_310svboolx4_tu10__SVBool_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svboolx4_t test_svset4_b8_3(svboolx4_t tuple, svbool_t x) +{ + return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 3, x); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c @@ -0,0 +1,600 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// Single, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z27test_svsqdmulh_single_s8_x210svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svsqdmulh_single_s8_x2(svint8x2_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s16_x211svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t 
test_svsqdmulh_single_s16_x2(svint16x2_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s32_x211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svsqdmulh_single_s32_x2(svint32x2_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s64_x211svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.single.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svsqdmulh_single_s64_x2(svint64x2_t 
zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s64_x2,,,)(zdn, zm); +} + +// Single, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z27test_svsqdmulh_single_s8_x410svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svsqdmulh_single_s8_x4(svint8x4_t zdn, svint8_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = 
tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s16_x411svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svsqdmulh_single_s16_x4(svint16x4_t zdn, svint16_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( 
[[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s32_x411svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svsqdmulh_single_s32_x4(svint32x4_t zdn, svint32_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_single_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svsqdmulh_single_s64_x411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.single.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svsqdmulh_single_s64_x4(svint64x4_t zdn, svint64_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_single_s64_x4,,,)(zdn, zm); +} + +// Multi, x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svsqdmulh_s8_x210svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svsqdmulh_s8_x2(svint8x2_t zdn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s8_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s16_x211svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svsqdmulh_s16_x2(svint16x2_t zdn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s16_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s32_x211svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svsqdmulh_s32_x2(svint32x2_t zdn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s32_x2,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s64_x211svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sqdmulh.vgx2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svsqdmulh_s64_x2(svint64x2_t zdn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s64_x2,,,)(zdn, zm); +} + +// Multi, x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z20test_svsqdmulh_s8_x410svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZDN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZM]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svsqdmulh_s8_x4(svint8x4_t zdn, svint8x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s8_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s16_x411svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZDN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZM]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svsqdmulh_s16_x4(svint16x4_t zdn, svint16x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s16_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s32_x411svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZDN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svsqdmulh_s32_x4(svint32x4_t zdn, svint32x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s32_x4,,,)(zdn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsqdmulh_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z21test_svsqdmulh_s64_x411svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZDN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svsqdmulh_s64_x4(svint64x4_t zdn, svint64x4_t zm) { + return SVE_ACLE_FUNC(svsqdmulh,_s64_x4,,,)(zdn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c @@ -0,0 +1,434 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single-Multi +// + +// x2 +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u32j12svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single2_u64j12svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u32j12svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z28test_svsub_write_single4_u64j12svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) { + SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Multi-Multi +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u32j12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi2_u64j12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, 7, zn, zm); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u32j12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZM]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, 7, zn, zm); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_write_multi4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 
0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svsub_write_multi4_u64j12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZM]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[TMP8]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) { + SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, 7, zn, zm); +} + +// +// Accumulate to ZA +// + +// x2 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_f32j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// 
CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x2_u32j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_f64j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, 6, zn); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x2_u64j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, 6, zn); +} + +// x4 + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail 
call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_f32j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za32_vg1x4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za32_vg1x4_u32j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) { + SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_f64j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, 6, zn); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsub_za64_vg1x4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsub_za64_vg1x4_u64j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) { + SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, 6, zn); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c @@ -0,0 +1,252 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_bf16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svvdot_lane_za32_bf16_vg1x2j14svbfloat16x2_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_f16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_f16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_s16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// 
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_s16_vg1x2j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_s16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_u16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_u16_vg1x2j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[TMP2]], [[TMP0]], [[TMP1]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_u16,_vg1x2)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, 
svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svvdot_lane_za64_s16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_s16_vg1x4j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za64,_s16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: 
@test_svvdot_lane_za64_u16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_u16_vg1x4j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) { + SVE_ACLE_FUNC(svvdot_lane_za64,_u16,_vg1x4)(slice_base, 7, zn, zm, 1); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svsuvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svsuvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) { + SVE_ACLE_FUNC(svsuvdot_lane_za32,_s8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svusvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) { + SVE_ACLE_FUNC(svusvdot_lane_za32,_u8,_vg1x4)(slice_base, 7, zn, zm, 3); +} + diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c @@ -0,0 +1,1222 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg2j11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg2j10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg2j11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) { + 
SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_s8_vg2j10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 14 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg2,)(0, base, 14, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_hor_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z27test_svwrite_hor_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_u8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_u8_vg4j11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za8_s8_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_ver_za8_s8_vg4j10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 32) +// 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[VAL]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 12 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg4,)(0, base, 12, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg2j12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg2j14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[VAL]], i64 8) 
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg2j13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg2j11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 6 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg2,)(1, base, 6, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_hor_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_u16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_u16_vg4j12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) { + 
SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_bf16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svwrite_ver_za16_bf16_vg4j14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_f16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_f16_vg4j13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za16_s16_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za16_s16_vg4j11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[VAL]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 4 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg4,)(1, base, 4, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg2j12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg2j13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) { + 
SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_s32_vg2j11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 2 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg2,)(3, base, 2, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_u32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_u32_vg4j12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_u32_vg4(uint32_t 
base, svuint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_f32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_f32_vg4j13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za32_s32_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za32_s32_vg4j11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[VAL]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg4,)(3, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z29test_svwrite_hor_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) 
+// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg2,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], 
[[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_hor_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_hor_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_u64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( 
[[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_u64_vg4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_f64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_f64_vg4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_ver_za64_s64_vg4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svwrite_ver_za64_s64_vg4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg4,)(7, base, 0, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x2j12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x2j13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x2j11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[VAL]], 
i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[TMP2]], [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x2,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_u64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_u64_vg1x4j12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x4,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_f64_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_f64_vg1x4j13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x4,)(base, 7, val); +} + +__attribute__((arm_streaming, arm_shared_za)) +// CHECK-LABEL: @test_svwrite_za64_s64_vg1x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svwrite_za64_s64_vg1x4j11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[VAL]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = add i32 [[BASE:%.*]], 7 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[TMP4]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svwrite_za64_s64_vg1x4(uint32_t base, svint64x4_t val) { + SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x4,)(base, 7, val); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1) A1 +#else +#define SVE_ACLE_FUNC(A1) A1 +#endif + +// CHECK-LABEL: @test_svzero_zt( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.zt(i32 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svzero_ztv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.zt(i32 0) +// CPP-CHECK-NEXT: ret void +// +__attribute__((arm_shared_za)) +void test_svzero_zt(void) { + svzero_zt(0); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sve_add.c @@ -0,0 +1,556 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// +// Multi-Single Vector +// + +// x2 + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_s810svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint8x2_t test_svadd_vector_single2_s8(svint8x2_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single2_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv16i8( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x2_t 
test_svadd_vector_single2_u8(svuint8x2_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s1611svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svadd_vector_single2_s16(svint16x2_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u1612svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv8i16( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svadd_vector_single2_u16(svuint16x2_t zn, svuint16_t zm) { 
+ return SVE_ACLE_FUNC(svadd,_single_u16_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s3211svint32x2_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svadd_vector_single2_s32(svint32x2_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u3212svuint32x2_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv4i32( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t test_svadd_vector_single2_u32(svuint32x2_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x2,,)(zn, zm); +} + + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_s6411svint64x2_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svadd_vector_single2_s64(svint64x2_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x2,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single2_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single2_u6412svuint64x2_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.add.single.x2.nxv2i64( [[TMP0]], [[TMP1]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svadd_vector_single2_u64(svuint64x2_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x2,,)(zn, zm); +} + + +// x4 + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svadd_vector_single4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_s810svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint8x4_t test_svadd_vector_single4_s8(svint8x4_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s8_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z28test_svadd_vector_single4_u811svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP8]], [[TMP9]], i64 32) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint8x4_t test_svadd_vector_single4_u8(svuint8x4_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u8_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret 
[[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s1611svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint16x4_t test_svadd_vector_single4_s16(svint16x4_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s16_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u1612svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv8i16( 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 16) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint16x4_t test_svadd_vector_single4_u16(svuint16x4_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u16_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s3211svint32x4_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint32x4_t test_svadd_vector_single4_s32(svint32x4_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s32_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u3212svuint32x4_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint32x4_t test_svadd_vector_single4_u32(svuint32x4_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u32_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_s6411svint64x4_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svint64x4_t test_svadd_vector_single4_s64(svint64x4_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_s64_x4,,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svadd_vector_single4_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], 
[[TMP7]], i64 2) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CHECK-NEXT: ret [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z29test_svadd_vector_single4_u6412svuint64x4_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.add.single.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 4) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP12]] +// +svuint64x4_t test_svadd_vector_single4_u64(svuint64x4_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svadd,_single_u64_x4,,)(zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c @@ -0,0 +1,34 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s + +// FIXME: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0. +#include <arm_sve.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// Test that svreinterpret is inlined (because it should be streaming-compatible) +__attribute__((target("sme"))) +// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16> +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16> +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming { + return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op); +} + diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c @@ -14,20 +14,20 @@ #else #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst2_bf16( +// CHECK-LABEL: @test_svst2_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( +// CPP-CHECK-LABEL: @_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) @@ -35,22 +35,22 @@ return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_bf16( +// CHECK-LABEL: @test_svst2_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( +// CPP-CHECK-LABEL: @_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c @@ -14,14 +14,14 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s8( +// CHECK-LABEL: @test_svst2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst2_s8u10__SVBool_tPa10svint8x2_t( +// CPP-CHECK-LABEL: @_Z13test_svst2_s8u10__SVBool_tPa10svint8x2_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) @@ -33,20 +33,20 @@ return SVE_ACLE_FUNC(svst2,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s16( +// CHECK-LABEL: 
@test_svst2_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) @@ -54,20 +54,20 @@ return SVE_ACLE_FUNC(svst2,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s32( +// CHECK-LABEL: @test_svst2_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) @@ -75,20 +75,20 @@ return SVE_ACLE_FUNC(svst2,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_s64( +// CHECK-LABEL: @test_svst2_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s64(svbool_t pg, int64_t *base, svint64x2_t data) @@ -96,14 +96,14 @@ return SVE_ACLE_FUNC(svst2,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u8( +// CHECK-LABEL: @test_svst2_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst2_u8u10__SVBool_tPh11svuint8x2_t( +// CPP-CHECK-LABEL: @_Z13test_svst2_u8u10__SVBool_tPh11svuint8x2_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) @@ -115,20 +115,20 @@ return SVE_ACLE_FUNC(svst2,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u16( +// CHECK-LABEL: @test_svst2_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) @@ -136,20 +136,20 @@ return SVE_ACLE_FUNC(svst2,_u16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u32( +// CHECK-LABEL: @test_svst2_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) @@ -157,20 +157,20 @@ return SVE_ACLE_FUNC(svst2,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_u64( +// CHECK-LABEL: @test_svst2_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) @@ -178,20 +178,20 @@ return SVE_ACLE_FUNC(svst2,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f16( +// CHECK-LABEL: @test_svst2_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr 
[[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) @@ -199,20 +199,20 @@ return SVE_ACLE_FUNC(svst2,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f32( +// CHECK-LABEL: @test_svst2_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) @@ -220,20 +220,20 @@ return SVE_ACLE_FUNC(svst2,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_f64( +// CHECK-LABEL: @test_svst2_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( +// CPP-CHECK-LABEL: @_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f64(svbool_t pg, float64_t *base, svfloat64x2_t data) @@ -241,20 +241,20 @@ return SVE_ACLE_FUNC(svst2,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s8( +// CHECK-LABEL: @test_svst2_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( +// CPP-CHECK-LABEL: @_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data) @@ -262,22 +262,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s16( +// CHECK-LABEL: @test_svst2_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t data) @@ -285,22 +285,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s32( +// 
CHECK-LABEL: @test_svst2_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x2_t data) @@ -308,22 +308,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s64( +// CHECK-LABEL: @test_svst2_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret 
void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t data) @@ -331,20 +331,20 @@ return SVE_ACLE_FUNC(svst2_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u8( +// CHECK-LABEL: @test_svst2_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( +// CPP-CHECK-LABEL: @_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t data) @@ -352,22 +352,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define 
{{[^@]+}}@test_svst2_vnum_u16( +// CHECK-LABEL: @test_svst2_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t data) @@ -375,22 +375,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u32( +// CHECK-LABEL: @test_svst2_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], 
[[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t data) @@ -398,22 +398,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u64( +// CHECK-LABEL: @test_svst2_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t data) @@ -421,22 +421,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_u64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f16( +// CHECK-LABEL: @test_svst2_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2_t data) @@ -444,22 +444,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f32( +// CHECK-LABEL: @test_svst2_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], 
[[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2_t data) @@ -467,22 +467,22 @@ return SVE_ACLE_FUNC(svst2_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f64( +// CHECK-LABEL: @test_svst2_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( +// CPP-CHECK-LABEL: @_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) 
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c @@ -15,22 +15,22 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst3_bf16( +// CHECK-LABEL: @test_svst3_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( +// CPP-CHECK-LABEL: @_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: 
tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) @@ -38,24 +38,24 @@ return SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_bf16( +// CHECK-LABEL: @test_svst3_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( +// CPP-CHECK-LABEL: @_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c @@ -14,7 +14,7 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// 
CHECK-LABEL: define {{[^@]+}}@test_svst3_s8( +// CHECK-LABEL: @test_svst3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -22,7 +22,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst3_s8u10__SVBool_tPa10svint8x3_t( +// CPP-CHECK-LABEL: @_Z13test_svst3_s8u10__SVBool_tPa10svint8x3_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -35,22 +35,22 @@ return SVE_ACLE_FUNC(svst3,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s16( +// CHECK-LABEL: @test_svst3_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) @@ -58,22 +58,22 @@ return SVE_ACLE_FUNC(svst3,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s32( +// 
CHECK-LABEL: @test_svst3_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) @@ -81,22 +81,22 @@ return SVE_ACLE_FUNC(svst3,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_s64( +// CHECK-LABEL: @test_svst3_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: 
tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s64(svbool_t pg, int64_t *base, svint64x3_t data) @@ -104,7 +104,7 @@ return SVE_ACLE_FUNC(svst3,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u8( +// CHECK-LABEL: @test_svst3_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -112,7 +112,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst3_u8u10__SVBool_tPh11svuint8x3_t( +// CPP-CHECK-LABEL: @_Z13test_svst3_u8u10__SVBool_tPh11svuint8x3_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) @@ -125,22 +125,22 @@ return SVE_ACLE_FUNC(svst3,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u16( +// CHECK-LABEL: @test_svst3_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( 
[[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) @@ -148,22 +148,22 @@ return SVE_ACLE_FUNC(svst3,_u16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u32( +// CHECK-LABEL: @test_svst3_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) @@ -171,22 +171,22 @@ return SVE_ACLE_FUNC(svst3,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_u64( +// CHECK-LABEL: @test_svst3_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) @@ -194,22 +194,22 @@ return SVE_ACLE_FUNC(svst3,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f16( +// CHECK-LABEL: @test_svst3_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) @@ -217,22 +217,22 @@ return SVE_ACLE_FUNC(svst3,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f32( +// CHECK-LABEL: @test_svst3_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) @@ -240,22 +240,22 @@ return SVE_ACLE_FUNC(svst3,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_f64( +// CHECK-LABEL: @test_svst3_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( +// CPP-CHECK-LABEL: @_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) @@ -263,22 +263,22 @@ return SVE_ACLE_FUNC(svst3,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s8( +// CHECK-LABEL: @test_svst3_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( +// CPP-CHECK-LABEL: @_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data) @@ -286,24 +286,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s16( +// CHECK-LABEL: @test_svst3_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t data) @@ -311,24 +311,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s32( +// CHECK-LABEL: @test_svst3_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// 
CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t data) @@ -336,24 +336,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s64( +// CHECK-LABEL: @test_svst3_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t data) @@ -361,22 +361,22 @@ return SVE_ACLE_FUNC(svst3_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u8( +// CHECK-LABEL: @test_svst3_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( +// CPP-CHECK-LABEL: @_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t data) @@ -384,24 +384,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u8,,)(pg, base, 
vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u16( +// CHECK-LABEL: @test_svst3_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t data) @@ -409,24 +409,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u32( +// CHECK-LABEL: @test_svst3_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( 
[[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t data) @@ -434,24 +434,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u64( +// CHECK-LABEL: @test_svst3_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 
[[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t data) @@ -459,24 +459,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_u64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f16( +// CHECK-LABEL: @test_svst3_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3_t data) @@ -484,24 +484,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f32( +// CHECK-LABEL: @test_svst3_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) 
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3_t data) @@ -509,24 +509,24 @@ return SVE_ACLE_FUNC(svst3_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f64( +// CHECK-LABEL: @test_svst3_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( +// CPP-CHECK-LABEL: @_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x3_t data) diff --git 
a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c @@ -15,24 +15,24 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst4_bf16( +// CHECK-LABEL: @test_svst4_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( +// CPP-CHECK-LABEL: @_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) @@ -40,26 +40,26 @@ return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_bf16( +// CHECK-LABEL: @test_svst4_vnum_bf16( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( +// CPP-CHECK-LABEL: @_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c @@ -14,7 +14,7 @@ #define 
SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s8( +// CHECK-LABEL: @test_svst4_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -23,7 +23,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst4_s8u10__SVBool_tPa10svint8x4_t( +// CPP-CHECK-LABEL: @_Z13test_svst4_s8u10__SVBool_tPa10svint8x4_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -37,24 +37,24 @@ return SVE_ACLE_FUNC(svst4,_s8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s16( +// CHECK-LABEL: @test_svst4_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) @@ -62,24 +62,24 @@ return SVE_ACLE_FUNC(svst4,_s16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s32( +// CHECK-LABEL: @test_svst4_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data) @@ -87,24 +87,24 @@ return SVE_ACLE_FUNC(svst4,_s32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_s64( +// CHECK-LABEL: 
@test_svst4_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s64(svbool_t pg, int64_t *base, svint64x4_t data) @@ -112,7 +112,7 @@ return SVE_ACLE_FUNC(svst4,_s64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u8( +// CHECK-LABEL: @test_svst4_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -121,7 +121,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z13test_svst4_u8u10__SVBool_tPh11svuint8x4_t( +// CPP-CHECK-LABEL: 
@_Z13test_svst4_u8u10__SVBool_tPh11svuint8x4_t( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) @@ -135,24 +135,24 @@ return SVE_ACLE_FUNC(svst4,_u8,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u16( +// CHECK-LABEL: @test_svst4_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data) @@ -160,24 +160,24 @@ return SVE_ACLE_FUNC(svst4,_u16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u32( +// CHECK-LABEL: @test_svst4_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) @@ -185,24 +185,24 @@ return SVE_ACLE_FUNC(svst4,_u32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_u64( +// CHECK-LABEL: @test_svst4_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], 
[[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) @@ -210,24 +210,24 @@ return SVE_ACLE_FUNC(svst4,_u64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f16( +// CHECK-LABEL: @test_svst4_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( 
[[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) @@ -235,24 +235,24 @@ return SVE_ACLE_FUNC(svst4,_f16,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f32( +// CHECK-LABEL: @test_svst4_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) @@ -260,24 +260,24 @@ return SVE_ACLE_FUNC(svst4,_f32,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_f64( +// CHECK-LABEL: @test_svst4_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( +// CPP-CHECK-LABEL: @_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) @@ -285,24 +285,24 @@ return SVE_ACLE_FUNC(svst4,_f64,,)(pg, base, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s8( +// CHECK-LABEL: @test_svst4_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( +// CPP-CHECK-LABEL: @_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr 
[[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x4_t data) @@ -310,26 +310,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_s8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s16( +// CHECK-LABEL: @test_svst4_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t data) @@ -337,26 +337,26 @@ return 
SVE_ACLE_FUNC(svst4_vnum,_s16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s32( +// CHECK-LABEL: @test_svst4_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t data) @@ -364,26 +364,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_s32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s64( +// CHECK-LABEL: @test_svst4_vnum_s64( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t data) @@ -391,24 +391,24 @@ return SVE_ACLE_FUNC(svst4_vnum,_s64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u8( +// CHECK-LABEL: @test_svst4_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( 
[[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( +// CPP-CHECK-LABEL: @_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t data) @@ -416,26 +416,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u8,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u16( +// CHECK-LABEL: @test_svst4_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t data) @@ -443,26 +443,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u32( +// CHECK-LABEL: @test_svst4_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t data) @@ -470,26 +470,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u64( +// CHECK-LABEL: @test_svst4_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t data) @@ -497,26 +497,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_u64,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f16( +// CHECK-LABEL: @test_svst4_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4_t data) @@ -524,26 +524,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_f16,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f32( +// CHECK-LABEL: @test_svst4_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// 
CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4_t data) @@ -551,26 +551,26 @@ return SVE_ACLE_FUNC(svst4_vnum,_f32,,)(pg, base, vnum, data); } -// CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f64( +// CHECK-LABEL: @test_svst4_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // -// 
CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( +// CPP-CHECK-LABEL: @_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c @@ -0,0 +1,390 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svrevd_s8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_zu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_zu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_zu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_zu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_z(svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_zu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) { + return 
SVE_ACLE_FUNC(svrevd, _u8, _z, )(pg, op); +} +// CHECK-LABEL: @test_svrevd_u16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_zu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_zu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_zu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_mu10__SVInt8_tu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_mu11__SVInt16_tu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_mu11__SVInt32_tu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_mu11__SVInt64_tu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_mu11__SVUint8_tu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( [[INACTIVE:%.*]], [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_mu12__SVUint16_tu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_mu12__SVUint32_tu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_mu12__SVUint64_tu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svrevd_s8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_s8_xu10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) { + return SVE_ACLE_FUNC(svrevd, _s8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s16_xu10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) { + return SVE_ACLE_FUNC(svrevd, _s16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s32_xu10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) { + return SVE_ACLE_FUNC(svrevd, _s32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_s64_x( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_s64_xu10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) { + return SVE_ACLE_FUNC(svrevd, _s64, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u8_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svrevd_u8_xu10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.revd.nxv16i8( undef, [[PG:%.*]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) { + return SVE_ACLE_FUNC(svrevd, _u8, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u16_xu10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv8i16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) { + return SVE_ACLE_FUNC(svrevd, _u16, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u32_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u32_xu10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv4i32( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) { + return SVE_ACLE_FUNC(svrevd, _u32, _x, )(pg, op); +} + +// CHECK-LABEL: @test_svrevd_u64_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svrevd_u64_xu10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.revd.nxv2i64( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) { + return SVE_ACLE_FUNC(svrevd, _u64, _x, )(pg, op); +} diff --git 
a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_addqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_addqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_addqv.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svaddqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svaddqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_svaddqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(svaddqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_svaddqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(svaddqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_svaddqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(svaddqv,_s32,,)(pg, op1); +} + +// 
CHECK-LABEL: @test_svaddqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.addqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.addqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_svaddqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(svaddqv,_s64,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svaddqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_svaddqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(svaddqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_svaddqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(svaddqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_svaddqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(svaddqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svaddqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.addqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> 
@llvm.aarch64.sve.addqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_svaddqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(svaddqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_andqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_andqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_andqv.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svandqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svandqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_svandqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(svandqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svandqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_svandqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(svandqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svandqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x 
i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_svandqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(svandqv,_s32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svandqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_svandqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(svandqv,_s64,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svandqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_svandqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(svandqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svandqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_svandqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(svandqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svandqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_svandqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(svandqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svandqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: 
@_Z16test_svandqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_svandqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(svandqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_eorqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_eorqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_eorqv.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_sveorqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_sveorqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_sveorqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(sveorqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_sveorqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_sveorqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(sveorqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// 
CPP-CHECK-LABEL: @_Z16test_sveorqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_sveorqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(sveorqv,_s32,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_sveorqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_sveorqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(sveorqv,_s64,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_sveorqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_sveorqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(sveorqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_sveorqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_sveorqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(sveorqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_sveorqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_sveorqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(sveorqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_sveorqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_sveorqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_sveorqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(sveorqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_faddqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_faddqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_faddqv.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svaddqv_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_f16u10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_svaddqv_f16(svbool_t pg, svfloat16_t op) +{ + return SVE_ACLE_FUNC(svaddqv,,_f16,)(pg, op); +} + +// CHECK-LABEL: @test_svaddqv_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_f32u10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_svaddqv_f32(svbool_t pg, svfloat32_t op) +{ + return SVE_ACLE_FUNC(svaddqv,,_f32,)(pg, op); +} + +// CHECK-LABEL: @test_svaddqv_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svaddqv_f64u10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]] +// +float64x2_t test_svaddqv_f64(svbool_t pg, svfloat64_t op) +{ + return SVE_ACLE_FUNC(svaddqv,,_f64,)(pg, op); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fmaxqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fmaxqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fmaxqv.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmaxqv_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fmaxqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_f16u10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fmaxqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_svmaxqv_f16(svbool_t pg, svfloat16_t op) +{ + return SVE_ACLE_FUNC(svmaxqv,,_f16,)(pg, op); +} + +// CHECK-LABEL: @test_svmaxqv_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fmaxqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_f32u10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fmaxqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_svmaxqv_f32(svbool_t pg, svfloat32_t op) +{ + return SVE_ACLE_FUNC(svmaxqv,,_f32,)(pg, op); +} + +// CHECK-LABEL: @test_svmaxqv_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fmaxqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_f64u10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fmaxqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]] +// +float64x2_t test_svmaxqv_f64(svbool_t pg, svfloat64_t op) +{ + return SVE_ACLE_FUNC(svmaxqv,,_f64,)(pg, op); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fminqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fminqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_fminqv.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 
-S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svminqv_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fminqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_f16u10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fminqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_svminqv_f16(svbool_t pg, svfloat16_t op) +{ + return SVE_ACLE_FUNC(svminqv,,_f16,)(pg, op); +} + +// CHECK-LABEL: @test_svminqv_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fminqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_f32u10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fminqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_svminqv_f32(svbool_t pg, svfloat32_t op) +{ + return SVE_ACLE_FUNC(svminqv,,_f32,)(pg, op); +} + +// CHECK-LABEL: @test_svminqv_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fminqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_f64u10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fminqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]] +// +float64x2_t test_svminqv_f64(svbool_t pg, svfloat64_t op) +{ + return SVE_ACLE_FUNC(svminqv,,_f64,)(pg, op); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_maxnmvqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_maxnmvqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_maxnmvqv.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// 
RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmaxnmqv_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fmaxnmqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svmaxnmqv_f16u10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fmaxnmqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_svmaxnmqv_f16(svbool_t pg, svfloat16_t op) +{ + return SVE_ACLE_FUNC(svmaxnmqv,,_f16,)(pg, op); +} + +// CHECK-LABEL: @test_svmaxnmqv_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fmaxnmqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svmaxnmqv_f32u10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fmaxnmqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_svmaxnmqv_f32(svbool_t pg, svfloat32_t op) +{ + return SVE_ACLE_FUNC(svmaxnmqv,,_f32,)(pg, op); +} + +// CHECK-LABEL: @test_svmaxnmqv_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fmaxnmqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svmaxnmqv_f64u10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fmaxnmqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]] +// +float64x2_t test_svmaxnmqv_f64(svbool_t pg, svfloat64_t op) +{ + return SVE_ACLE_FUNC(svmaxnmqv,,_f64,)(pg, op); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_minnmvqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_minnmvqv.c new file mode 100644 --- /dev/null +++ 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_minnmvqv.c @@ -0,0 +1,67 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svminnmqv_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fminnmqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svminnmqv_f16u10__SVBool_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.fminnmqv.v8f16.nxv8f16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_svminnmqv_f16(svbool_t pg, svfloat16_t op) +{ + return SVE_ACLE_FUNC(svminnmqv,,_f16,)(pg, op); +} + +// CHECK-LABEL: @test_svminnmqv_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fminnmqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svminnmqv_f32u10__SVBool_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.fminnmqv.v4f32.nxv4f32( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_svminnmqv_f32(svbool_t pg, svfloat32_t op) +{ + return SVE_ACLE_FUNC(svminnmqv,,_f32,)(pg, op); +} + +// CHECK-LABEL: @test_svminnmqv_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fminnmqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svminnmqv_f64u10__SVBool_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.fminnmqv.v2f64.nxv2f64( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]] +// +float64x2_t 
test_svminnmqv_f64(svbool_t pg, svfloat64_t op) +{ + return SVE_ACLE_FUNC(svminnmqv,,_f64,)(pg, op); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_orqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_orqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_orqv.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svorqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z14test_svorqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_svorqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(svorqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_svorqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(svorqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t 
test_svorqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(svorqv,_s32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_svorqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(svorqv,_s64,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z14test_svorqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_svorqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(svorqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_svorqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(svorqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_svorqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(svorqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svorqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svorqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( 
[[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_svorqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(svorqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_smaxqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_smaxqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_smaxqv.c @@ -0,0 +1,78 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmaxqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.smaxqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svmaxqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.smaxqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_svmaxqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.smaxqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.smaxqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_svmaxqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.smaxqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.smaxqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_svmaxqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_s32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.smaxqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.smaxqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_svmaxqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_s64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_sminqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_sminqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_sminqv.c @@ -0,0 +1,78 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svminqv_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.sminqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svminqv_s8u10__SVBool_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.sminqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_svminqv_s8(svbool_t pg, svint8_t op1) { + return SVE_ACLE_FUNC(svminqv,_s8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.sminqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_s16u10__SVBool_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.sminqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_svminqv_s16(svbool_t pg, svint16_t op1) { + return SVE_ACLE_FUNC(svminqv,_s16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.sminqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_s32u10__SVBool_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.sminqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_svminqv_s32(svbool_t pg, svint32_t op1) { + return SVE_ACLE_FUNC(svminqv,_s32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.sminqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_s64u10__SVBool_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.sminqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +int64x2_t test_svminqv_s64(svbool_t pg, svint64_t op1) { + return SVE_ACLE_FUNC(svminqv,_s64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_umaxqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_umaxqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_umaxqv.c @@ -0,0 +1,78 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S 
-O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmaxqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.umaxqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svmaxqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.umaxqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_svmaxqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.umaxqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.umaxqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_svmaxqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.umaxqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.umaxqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_svmaxqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svmaxqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.umaxqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svmaxqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.umaxqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_svmaxqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(svmaxqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_uminqv.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_uminqv.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2_uminqv.c @@ -0,0 +1,78 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svminqv_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.uminqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svminqv_u8u10__SVBool_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.aarch64.sve.uminqv.v16i8.nxv16i8( [[PG:%.*]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_svminqv_u8(svbool_t pg, svuint8_t op1) { + return SVE_ACLE_FUNC(svminqv,_u8,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.uminqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_u16u10__SVBool_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.aarch64.sve.uminqv.v8i16.nxv8i16( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_svminqv_u16(svbool_t pg, svuint16_t op1) { + return SVE_ACLE_FUNC(svminqv,_u16,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.uminqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_u32u10__SVBool_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: 
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.aarch64.sve.uminqv.v4i32.nxv4i32( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_svminqv_u32(svbool_t pg, svuint32_t op1) { + return SVE_ACLE_FUNC(svminqv,_u32,,)(pg, op1); +} + +// CHECK-LABEL: @test_svminqv_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z16test_svminqv_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64( [[TMP0]], [[OP1:%.*]]) +// CPP-CHECK-NEXT: ret <2 x i64> [[TMP1]] +// +uint64x2_t test_svminqv_u64(svbool_t pg, svuint64_t op1) { + return SVE_ACLE_FUNC(svminqv,_u64,,)(pg, op1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c @@ -0,0 +1,133 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
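+// For illustration: with SVE_OVERLOADED_FORMS defined, only A1 and A3 are
+// pasted, so SVE_ACLE_FUNC(svadd, _bf16, _m) expands to the overloaded name
+// svadd_m; without it, all three pieces are concatenated, giving the fully
+// suffixed name svadd_bf16_m.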
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svadd_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _bf16, _x)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fadd.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svadd_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fadd.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svadd_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svadd, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c @@ -0,0 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svclamp_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svclamp_bf16u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svclamp_bf16(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svclamp, _bf16,)(op1, op2, op3); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmax_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svmax_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmax.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmax_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmax.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmax_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmax, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmaxnm_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], 
[[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svmaxnm_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmaxnm_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, 
bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmaxnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmin_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svmin_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmin.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmin_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmin.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmin_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmin, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svminnm_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], 
[[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svminnm_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fminnm.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svminnm_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, 
bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fminnm.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svminnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svminnm, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c @@ -0,0 +1,133 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmla_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _bf16, _m)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmla_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _bf16, _z)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmla_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _bf16, _x)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmla_n_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _n_bf16, _m)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmla_n_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmla.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _n_bf16, _z)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmla_n_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmla.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmla_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla, _n_bf16, _x)(pg, op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c new file mode 100644 --- /dev/null +++ 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c @@ -0,0 +1,60 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmla_lane_bf16_idx1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmla_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 1); +} + +// CHECK-LABEL: @test_svmla_lane_bf16_idx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmla_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmla_lane ,_bf16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svmla_lane_bf16_idx7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmla.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmla_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return 
SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 7); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c @@ -0,0 +1,133 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmls_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _bf16, _m)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmls_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// 
+svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _bf16, _z)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmls_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _bf16, _x)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmls_n_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _n_bf16, _m)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmls_n_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmls.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// 
+svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _n_bf16, _z)(pg, op1, op2, op3); +} + +// CHECK-LABEL: @test_svmls_n_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP3:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmls.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmls_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls, _n_bf16, _x)(pg, op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c @@ -0,0 +1,60 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmls_lane_bf16_idx1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmls_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 1); +} + +// CHECK-LABEL: @test_svmls_lane_bf16_idx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmls_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svmls_lane_bf16_idx7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmls.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmls_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) +{ + return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 7); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c @@ -0,0 +1,87 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// BFMLSLB + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslb( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_bfmlslbu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslb(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslb,_f32,,)(zda, zn, zm); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslb_idx( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_bfmlslb_idxu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslb.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslb_idx(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslb_lane,_f32,,)(zda, zn, zm, 7); +} + +// BFMLSLT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslt( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z12test_bfmlsltu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslt(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslt,_f32,,)(zda, zn, zm); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_bfmlslt_idx( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_bfmlslt_idxu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.bfmlslt.lane( [[ZDA:%.*]], [[ZN:%.*]], [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_bfmlslt_idx(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) +{ + return SVE_ACLE_FUNC(svbfmlslt_lane,_f32,,)(zda, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated 
by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmul_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: 
@_Z17test_svmul_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svmul_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fmul.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svmul_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: 
@_Z19test_svmul_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fmul.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svmul_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c @@ -0,0 +1,61 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
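+// Note (illustrative, based on the #define below): with SVE_OVERLOADED_FORMS the
+// call SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 1) resolves to the overloaded
+// name svmul_lane(op1, op2, 1); without it, the fully mangled form
+// svmul_lane_bf16(op1, op2, 1) is used instead.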
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svmul_lane_bf16_idx1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 1); +} + +// CHECK-LABEL: @test_svmul_lane_bf16_idx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 3); +} + +// CHECK-LABEL: @test_svmul_lane_bf16_idx7( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmul.lane.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svmul_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 7); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... 
macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svsub_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _bf16, _x)(pg, op1, op2); +} + + +// CHECK-LABEL: @test_svsub_bf16_n_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat 
[[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _n_bf16, _m)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_n_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], zeroinitializer +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.fsub.nxv8bf16( [[TMP0]], [[TMP1]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _n_bf16, _z)(pg, op1, op2); +} + +// CHECK-LABEL: @test_svsub_bf16_n_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP2:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fsub.u.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsub_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) +{ + return SVE_ACLE_FUNC(svsub, _n_bf16, _x)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_clamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_clamp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_clamp.c @@ -0,0 +1,187 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// SCLAMP + +// CHECK-LABEL: @test_svclamp_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svclamp_s8u10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svclamp_s8(svint8_t op1, svint8_t op2, svint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _s8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s16u11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svclamp_s16(svint16_t op1, svint16_t op2, svint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _s16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s32u11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svclamp_s32(svint32_t op1, svint32_t op2, svint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _s32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_s64u11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svclamp_s64(svint64_t op1, svint64_t op2, svint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _s64, , )(op1, op2, op3); +} + + +// UCLAMP + +// CHECK-LABEL: @test_svclamp_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svclamp_u8u11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv16i8( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svclamp_u8(svuint8_t op1, svuint8_t op2, svuint8_t op3) { + return SVE_ACLE_FUNC(svclamp, _u8, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u16u12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv8i16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svclamp_u16(svuint16_t op1, svuint16_t op2, svuint16_t op3) { + return SVE_ACLE_FUNC(svclamp, _u16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u32u12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svclamp_u32(svuint32_t op1, svuint32_t op2, svuint32_t op3) { + return SVE_ACLE_FUNC(svclamp, _u32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_u64u12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uclamp.nxv2i64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svclamp_u64(svuint64_t op1, svuint64_t op2, svuint64_t op3) { + return SVE_ACLE_FUNC(svclamp, _u64, , )(op1, op2, op3); +} + + +// FCLAMP + +// CHECK-LABEL: @test_svclamp_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f16u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv8f16( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svclamp_f16(svfloat16_t op1, svfloat16_t op2, svfloat16_t op3) { + return SVE_ACLE_FUNC(svclamp, _f16, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( 
[[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f32u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svclamp_f32(svfloat32_t op1, svfloat32_t op2, svfloat32_t op3) { + return SVE_ACLE_FUNC(svclamp, _f32, , )(op1, op2, op3); +} + +// CHECK-LABEL: @test_svclamp_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svclamp_f64u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fclamp.nxv2f64( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svclamp_f64(svfloat64_t op1, svfloat64_t op2, svfloat64_t op3) { + return SVE_ACLE_FUNC(svclamp, _f64, , )(op1, op2, op3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c @@ -0,0 +1,133 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -DIGNORE_STREAMING_ATTR -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) 
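+// Note: defining __attribute__(...) away makes the __attribute__((arm_streaming))
+// annotations below expand to nothing, so the -DIGNORE_STREAMING_ATTR RUN lines
+// (which only enable +sve2p1) build these functions as ordinary non-streaming
+// code, while the +sme2 RUN lines keep the streaming attribute.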
+#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c8_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z19test_svcntp_c8_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c8_vlx2(svcount_t pnn) { + return svcntp_c8(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c8_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z19test_svcntp_c8_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c8(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c8_vlx4(svcount_t pnn) { + return svcntp_c8(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c16_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c16_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c16_vlx2(svcount_t pnn) { + return svcntp_c16(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c16_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c16_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c16(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c16_vlx4(svcount_t pnn) { + return svcntp_c16(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c32_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c32_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c32_vlx2(svcount_t pnn) { + return svcntp_c32(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c32_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c32_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c32(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c32_vlx4(svcount_t pnn) { + return svcntp_c32(pnn, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svcntp_c64_vlx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c64_vlx2u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 2) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c64_vlx2(svcount_t pnn) { + return svcntp_c64(pnn, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svcntp_c64_vlx4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CHECK-NEXT: ret i64 [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svcntp_c64_vlx4u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sve.cntp.c64(target("aarch64.svcount") [[PNN:%.*]], i32 4) +// CPP-CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_svcntp_c64_vlx4(svcount_t pnn) { + return svcntp_c64(pnn, 4); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c @@ -0,0 +1,109 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svdot_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_s32_x2u11__SVInt32_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svdot_s32_x2(svint32_t op1, svint16_t op2, svint16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_s32_s16_s16,)(op1, op2, op3); +} + +// CHECK-LABEL: @test_svdot_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_u32_x2u12__SVUint32_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svdot_u32_x2(svuint32_t op1, svuint16_t op2, svuint16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_u32_u16_u16,)(op1, op2, op3); +} + +// CHECK-LABEL: @test_svdot_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdot_f32_x2u13__SVFloat32_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svdot_f32_x2(svfloat32_t op1, svfloat16_t op2, svfloat16_t op3) +{ + return SVE_ACLE_FUNC(svdot,_f32_f16_f16,)(op1, op2, op3); +} + + + +// CHECK-LABEL: @test_svdot_lane_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_s32_x2u11__SVInt32_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.sdot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svdot_lane_s32_x2(svint32_t op1, svint16_t op2, svint16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_s32_s16_s16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svdot_lane_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_u32_x2u12__SVUint32_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.udot.lane.x2.nxv4i32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svdot_lane_u32_x2(svuint32_t op1, svuint16_t op2, svuint16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_u32_u16_u16,)(op1, op2, op3, 3); +} + +// CHECK-LABEL: @test_svdot_lane_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.lane.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] 
+// +// CPP-CHECK-LABEL: @_Z22test_svdot_lane_f32_x2u13__SVFloat32_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fdot.lane.x2.nxv4f32( [[OP1:%.*]], [[OP2:%.*]], [[OP3:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svdot_lane_f32_x2(svfloat32_t op1, svfloat16_t op2, svfloat16_t op3) +{ + return SVE_ACLE_FUNC(svdot_lane,_f32_f16_f16,)(op1, op2, op3, 3); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
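+// Note: this is the four-argument variant of the helper macro; per the #define
+// below, SVE_ACLE_FUNC(svextq_lane, _u8,,) resolves to the overloaded
+// svextq_lane when SVE_OVERLOADED_FORMS is defined and to svextq_lane_u8
+// otherwise.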
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svextq_lane_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv16i8( [[ZN]], [[ZM]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z19test_svextq_lane_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv16i8( [[ZN]], [[ZM]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svextq_lane_u8(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _u8,,)(zn, zm, 0); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv16i8( [[ZN]], [[ZM]], i32 4) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z19test_svextq_lane_s8u10__SVInt8_tu10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv16i8( [[ZN]], [[ZM]], i32 4) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svextq_lane_s8(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _s8,,)(zn, zm, 4); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8i16( [[ZN]], [[ZM]], i32 1) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8i16( [[ZN]], [[ZM]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svextq_lane_u16(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _u16,,)(zn, zm, 1); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8i16( [[ZN]], [[ZM]], i32 5) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_s16u11__SVInt16_tu11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8i16( [[ZN]], [[ZM]], i32 5) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svextq_lane_s16(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _s16,,)(zn, zm, 5); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4i32( [[ZN]], [[ZM]], i32 2) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4i32( [[ZN]], [[ZM]], i32 2) +// CPP-CHECK-NEXT: 
ret [[TMP0]] +// +svuint32_t test_svextq_lane_u32(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _u32,,)(zn, zm, 2); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4i32( [[ZN]], [[ZM]], i32 6) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_s32u11__SVInt32_tu11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4i32( [[ZN]], [[ZM]], i32 6) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svextq_lane_s32(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _s32,,)(zn, zm, 6); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2i64( [[ZN]], [[ZM]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2i64( [[ZN]], [[ZM]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svextq_lane_u64(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _u64,,)(zn, zm, 3); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2i64( [[ZN]], [[ZM]], i32 7) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_s64u11__SVInt64_tu11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2i64( [[ZN]], [[ZM]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svextq_lane_s64(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _s64,,)(zn, zm, 7); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_f16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8f16( [[ZN]], [[ZM]], i32 8) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_f16u13__SVFloat16_tu13__SVFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8f16( [[ZN]], [[ZM]], i32 8) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svextq_lane_f16(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _f16,,)(zn, zm, 8); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4f32( [[ZN]], [[ZM]], i32 9) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_f32u13__SVFloat32_tu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv4f32( [[ZN]], [[ZM]], i32 
9) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svextq_lane_f32(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _f32,,)(zn, zm, 9); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_f64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2f64( [[ZN]], [[ZM]], i32 10) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svextq_lane_f64u13__SVFloat64_tu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv2f64( [[ZN]], [[ZM]], i32 10) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svextq_lane_f64(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _f64,,)(zn, zm, 10); +} + +// CHECK-LABEL: define dso_local @test_svextq_lane_bf16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8bf16( [[ZN]], [[ZM]], i32 11) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z21test_svextq_lane_bf16u14__SVBFloat16_tu14__SVBFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.extq.lane.nxv8bf16( [[ZN]], [[ZM]], i32 11) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svextq_lane_bf16(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svextq_lane, _bf16,,)(zn, zm, 11); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -0,0 +1,1300 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) 
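+// Note: as in acle_sve2p1_cntp.c above, -DIGNORE_STREAMING_ATTR compiles the
+// arm_streaming attribute away for the +sve2p1 RUN line. The CHECK lines below
+// verify that each svld1_*_x2/_x4 call lowers to a single
+// llvm.aarch64.sve.ld1.pn.x2/.x4 intrinsic whose per-vector results are then
+// packed into one wide tuple value via llvm.vector.insert.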
+#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x2u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x2u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u32_x2u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x2u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_u8_x4u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// 
CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u16_x4u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u32_x4u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_u64_x4u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_u64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x2u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x2u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x2u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svld1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s64_x2u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svld1_s8_x4u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s16_x4u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s32_x4u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_s64_x4u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_s64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f16_x2u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f32_x2u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f64_x2u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], 
[[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f16_x4u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f32_x4u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) +{ + return 
SVE_ACLE_FUNC(svld1,_f32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1_f64_x4u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld1,_f64,_x4,)(pn, base); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x2u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t
test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x2u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x2u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// 
CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x2u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x4u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svld1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x4u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x4u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x4u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_u64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") 
[[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x2u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x2u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x2u11__SVCount_tPKil( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x2u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x4u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x4u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x4u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x4u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( 
poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_s64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x2u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x2u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// 
CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x2u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x4u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], 
[[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x4u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svld1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue 
{ , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x4u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld1_vnum,_f64,_x4,)(pn, base, vnum); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c @@ -0,0 +1,255 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
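+// For example (an illustrative sketch, not part of the generated checks): with
+// SVE_OVERLOADED_FORMS defined, SVE_ACLE_FUNC(svld1uwq, _u32, , ) pastes only the
+// first and third arguments and yields the overloaded name svld1uwq, while the
+// non-overloaded definition below pastes all four tokens and yields svld1uwq_u32.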
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// LD1W + +// CHECK-LABEL: define dso_local @test_svld1uwq_u32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svld1uwq_u32u10__SVBool_tPKj +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svld1uwq_u32(svbool_t pred, uint32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq, _u32, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1uwq_vnum_u32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svld1uwq_vnum_u32u10__SVBool_tPKj +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint32_t test_svld1uwq_vnum_u32(svbool_t pred, uint32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq_vnum, _u32, , )(pred, base, -8); +} + +// CHECK-LABEL: define dso_local @test_svld1uwq_s32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svld1uwq_s32u10__SVBool_tPKi +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svld1uwq_s32(svbool_t pred, int32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq, _s32, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1uwq_vnum_s32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// 
CPP-CHECK-LABEL: define dso_local @_Z22test_svld1uwq_vnum_s32u10__SVBool_tPKi +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint32_t test_svld1uwq_vnum_s32(svbool_t pred, int32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq_vnum, _s32, , )(pred, base, 7); +} + +// CHECK-LABEL: define dso_local @test_svld1uwq_f32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4f32( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svld1uwq_f32u10__SVBool_tPKf +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4f32( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svld1uwq_f32(svbool_t pred, float32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq, _f32, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1uwq_vnum_f32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svld1uwq_vnum_f32u10__SVBool_tPKf +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1uwq.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat32_t test_svld1uwq_vnum_f32(svbool_t pred, float32_t const * base) { + return SVE_ACLE_FUNC(svld1uwq_vnum, _f32, , )(pred, base, -8); +} + + +// LD1D + +// CHECK-LABEL: define dso_local @test_svld1udq_u64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svld1udq_u64u10__SVBool_tPKm +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svld1udq_u64(svbool_t pred, uint64_t const * base) { + return 
SVE_ACLE_FUNC(svld1udq, _u64, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1udq_vnum_u64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svld1udq_vnum_u64u10__SVBool_tPKm +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint64_t test_svld1udq_vnum_u64(svbool_t pred, uint64_t const * base) { + return SVE_ACLE_FUNC(svld1udq_vnum, _u64, , )(pred, base, 7); +} + +// CHECK-LABEL: define dso_local @test_svld1udq_s64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svld1udq_s64u10__SVBool_tPKl +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svld1udq_s64(svbool_t pred, int64_t const * base) { + return SVE_ACLE_FUNC(svld1udq, _s64, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1udq_vnum_s64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svld1udq_vnum_s64u10__SVBool_tPKl +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint64_t test_svld1udq_vnum_s64(svbool_t pred, int64_t const * base) { + return SVE_ACLE_FUNC(svld1udq_vnum, _s64, , )(pred, base, -8); +} + +// CHECK-LABEL: define dso_local @test_svld1udq_f64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2f64( [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: 
define dso_local @_Z17test_svld1udq_f64u10__SVBool_tPKd +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2f64( [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svld1udq_f64(svbool_t pred, float64_t const * base) { + return SVE_ACLE_FUNC(svld1udq, _f64, , )(pred, base); +} + +// CHECK-LABEL: define dso_local @test_svld1udq_vnum_f64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svld1udq_vnum_f64u10__SVBool_tPKd +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ld1udq.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svfloat64_t test_svld1udq_vnum_f64(svbool_t pred, float64_t const * base) { + return SVE_ACLE_FUNC(svld1udq_vnum, _f64, , )(pred, base, 7); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c @@ -0,0 +1,1299 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) 
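+// Note (added commentary, not part of the original test): with IGNORE_STREAMING_ATTR
+// defined, __attribute__(...) expands to nothing, so the arm_streaming annotations in
+// this file are dropped for the +sve2p1 RUN line and the same checks are exercised in a
+// non-streaming build.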
+#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_u8_x2u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u16_x2u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u32_x2u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u64_x2u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_u8_x4u11__SVCount_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u16_x4u11__SVCount_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u32_x4u11__SVCount_tPKj( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_u64_x4u11__SVCount_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_u64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , 
} [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_s8_x2u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s8,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s16_x2u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s32_x2u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) +{ + return 
SVE_ACLE_FUNC(svldnt1,_s32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s64_x2u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svldnt1_s8_x4u11__SVCount_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s8,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s16_x4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s16_x4u11__SVCount_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s32_x4u11__SVCount_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], 
[[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_s64_x4u11__SVCount_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_s64,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f16_x2u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f16,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f32_x2u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f32,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f64_x2u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f64,_x2,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f16_x4u11__SVCount_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f16,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f32_x4u11__SVCount_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f32,_x4,)(pn, base); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svldnt1_f64_x4u11__SVCount_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) +{ + return SVE_ACLE_FUNC(svldnt1,_f64,_x4,)(pn, base); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x2u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x2u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x2u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u64_x2( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x2u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x4u11__SVCount_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// 
CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x4u11__SVCount_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: 
@_Z24test_svldnt1_vnum_u32_x4u11__SVCount_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x4u11__SVCount_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: 
@test_svldnt1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x2u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x2u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: 
[[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x2u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x2u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x4u11__SVCount_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x4u11__SVCount_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +{ + 
return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x4u11__SVCount_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x4u11__SVCount_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], 
i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x2u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x2u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } 
@llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x2u11__SVCount_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x2,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x4u11__SVCount_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr 
[[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x4u11__SVCount_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x4,)(pn, base, vnum); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svldnt1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]]) +// 
CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
+// CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x4u11__SVCount_tPKdl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
+//
+svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum)
+{
+  return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x4,)(pn, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -0,0 +1,2495 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svld2q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z14test_svld2q_u8u10__SVBool_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svld2q_u8(svbool_t pg, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_u8,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z14test_svld2q_s8u10__SVBool_tPKa( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svld2q_s8(svbool_t pg, const int8_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_s8,)(pg, base); +} +// CHECK-LABEL: @test_svld2q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_u16u10__SVBool_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint16x2_t test_svld2q_u16(svbool_t pg, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_u16,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_s16u10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint16x2_t test_svld2q_s16(svbool_t pg, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_s16,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_u32u10__SVBool_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint32x2_t test_svld2q_u32(svbool_t pg, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_u32,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( 
[[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_s32u10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint32x2_t test_svld2q_s32(svbool_t pg, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_s32,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_u64u10__SVBool_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint64x2_t test_svld2q_u64(svbool_t pg, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_u64,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_s64u10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint64x2_t test_svld2q_s64(svbool_t pg, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_s64,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_f16u10__SVBool_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat16x2_t test_svld2q_f16(svbool_t pg, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_f16,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z16test_svld2q_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svbfloat16x2_t test_svld2q_bf16(svbool_t pg, const bfloat16_t *base) +{ + 
return SVE_ACLE_FUNC(svld2q,,_bf16,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_f32u10__SVBool_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat32x2_t test_svld2q_f32(svbool_t pg, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_f32,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2q_f64u10__SVBool_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svfloat64x2_t test_svld2q_f64(svbool_t pg, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld2q,,_f64,)(pg, base); +} + +// CHECK-LABEL: @test_svld2q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: 
ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_u8u10__SVBool_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum_,,u8,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: ret [[TMP5]] +// +// CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_s8u10__SVBool_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP5]] +// +svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_s8,)(pg, base, vnum); +} +// CHECK-LABEL: @test_svld2q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u16u10__SVBool_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, 
[[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_u16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s16u10__SVBool_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_s16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u32u10__SVBool_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint32x2_t 
test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_u32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s32u10__SVBool_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_s32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u64u10__SVBool_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_u64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s64u10__SVBool_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_s64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f16u10__SVBool_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_f16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svld2q_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_bf16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f32u10__SVBool_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_f32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld2q_vnum_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f64u10__SVBool_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svfloat64x2_t test_svld2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld2q_vnum,,_f64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z14test_svld3q_u8u10__SVBool_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svuint8x3_t test_svld3q_u8(svbool_t pg, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_u8,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_u16u10__SVBool_tPKt( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svuint16x3_t test_svld3q_u16(svbool_t pg, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_u16,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_s16u10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svint16x3_t test_svld3q_s16(svbool_t pg, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_s16,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: ret [[TMP7]] 
+// +// CPP-CHECK-LABEL: @_Z15test_svld3q_u32u10__SVBool_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svuint32x3_t test_svld3q_u32(svbool_t pg, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_u32,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_s32u10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svint32x3_t test_svld3q_s32(svbool_t pg, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_s32,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call 
@llvm.vector.insert.nxv6i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_u64u10__SVBool_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svuint64x3_t test_svld3q_u64(svbool_t pg, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_u64,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_s64u10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svint64x3_t test_svld3q_s64(svbool_t pg, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_s64,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } 
[[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_f16u10__SVBool_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svfloat16x3_t test_svld3q_f16(svbool_t pg, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_f16,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z16test_svld3q_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svbfloat16x3_t test_svld3q_bf16(svbool_t pg, const bfloat16_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_bf16,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.vector.insert.nxv12f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_f32u10__SVBool_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svfloat32x3_t test_svld3q_f32(svbool_t pg, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_f32,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z15test_svld3q_f64u10__SVBool_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svfloat64x3_t test_svld3q_f64(svbool_t pg, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld3q,,_f64,)(pg, base); +} + +// CHECK-LABEL: @test_svld3q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } 
[[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_u8u10__SVBool_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum_,,u8,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: ret [[TMP7]] +// +// CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_s8u10__SVBool_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: ret [[TMP7]] +// +svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_s8,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// 
CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u16u10__SVBool_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_u16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s16u10__SVBool_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) +{ + return 
SVE_ACLE_FUNC(svld3q_vnum,,_s16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u32u10__SVBool_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_u32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s32u10__SVBool_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_s32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u64u10__SVBool_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_u64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s64u10__SVBool_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_s64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f16u10__SVBool_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_f16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = 
extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z21test_svld3q_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x3_t test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_bf16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f32u10__SVBool_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_f32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld3q_vnum_f64( +// CHECK-NEXT: 
entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f64u10__SVBool_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x3_t test_svld3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld3q_vnum,,_f64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z14test_svld4q_u8u10__SVBool_tPKh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svld4q_u8(svbool_t pg, const uint8_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_u8,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_u16u10__SVBool_tPKt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint16x4_t test_svld4q_u16(svbool_t pg, const uint16_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_u16,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// 
CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_s16u10__SVBool_tPKs( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint16x4_t test_svld4q_s16(svbool_t pg, const int16_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_s16,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_u32u10__SVBool_tPKj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint32x4_t test_svld4q_u32(svbool_t pg, const uint32_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_u32,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) 
+// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_s32u10__SVBool_tPKi( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint32x4_t test_svld4q_s32(svbool_t pg, const int32_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_s32,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_u64u10__SVBool_tPKm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } 
[[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint64x4_t test_svld4q_u64(svbool_t pg, const uint64_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_u64,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_s64u10__SVBool_tPKl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint64x4_t test_svld4q_s64(svbool_t pg, const int64_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_s64,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_f16u10__SVBool_tPKDh( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat16x4_t test_svld4q_f16(svbool_t pg, const float16_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_f16,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z16test_svld4q_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP3]], [[TMP4]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP5]], [[TMP6]], i64 16) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP7]], [[TMP8]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP9]] +// 
+svbfloat16x4_t test_svld4q_bf16(svbool_t pg, const bfloat16_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_bf16,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_f32u10__SVBool_tPKf( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP3]], [[TMP4]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP5]], [[TMP6]], i64 8) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP7]], [[TMP8]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat32x4_t test_svld4q_f32(svbool_t pg, const float32_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_f32,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z15test_svld4q_f64u10__SVBool_tPKd( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP3]], [[TMP4]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP5]], [[TMP6]], i64 4) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP7]], [[TMP8]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svfloat64x4_t test_svld4q_f64(svbool_t pg, const float64_t *base) +{ + return SVE_ACLE_FUNC(svld4q,,_f64,)(pg, base); +} + +// CHECK-LABEL: @test_svld4q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_u8u10__SVBool_tPKhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum_,,u8,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// 
CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_s8u10__SVBool_tPKal( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP3]], [[TMP4]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP1]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP5]], [[TMP6]], i64 32) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP1]], 3 +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP7]], [[TMP8]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_s8,)(pg, base, vnum); +} +// CHECK-LABEL: @test_svld4q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u16u10__SVBool_tPKtl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], 
[[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_u16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s16u10__SVBool_tPKsl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_s16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// 
CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u32u10__SVBool_tPKjl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_u32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s32u10__SVBool_tPKil( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail 
call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_s32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u64u10__SVBool_tPKml( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint64x4_t test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_u64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = 
extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s64u10__SVBool_tPKll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_s64,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f16u10__SVBool_tPKDhl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// 
CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_f16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z21test_svld4q_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_bf16,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f32u10__SVBool_tPKfl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svld4q_vnum,,_f32,)(pg, base, vnum); +} + +// CHECK-LABEL: @test_svld4q_vnum_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP8]], [[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f64u10__SVBool_tPKdl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, 
<vscale x 2 x double> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 3
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP10]]
+//
+svfloat64x4_t test_svld4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
+{
+  return SVE_ACLE_FUNC(svld4q_vnum,,_f64,)(pg, base, vnum);
+}
+
+
+// Gather for 128 bits
+// vector base + scalar offset
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+//
+svuint64_t test_svld1q_gather_u64base_offset_u64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
+//
+svint64_t test_svld1q_gather_u64base_offset_s64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1q_gather_u64base_offset_u32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1q_gather_u64base_offset_s32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_u16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+//
+svuint16_t test_svld1q_gather_u64base_offset_u16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u16,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_s16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+//
+svint16_t test_svld1q_gather_u64base_offset_s16(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s16,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_offset_u8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svld1q_gather_u64base_offset_u8(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_u8,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z36test_svld1q_gather_u64base_offset_s8u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(<vscale x 16 x i1> [[PG:%.*]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svld1q_gather_u64base_offset_s8(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_s8,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f64u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+//
+svfloat64_t test_svld1q_gather_u64base_offset_f64(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f64,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f32u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(<vscale x 4 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1q_gather_u64base_offset_f32(svbool_t pg, svuint64_t base, int64_t offset)
+{
+  return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f32,)(pg, base, offset);
+}
+
+// CHECK-LABEL: @test_svld1q_gather_u64base_offset_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z37test_svld1q_gather_u64base_offset_f16u10__SVBool_tu12__SVUint64_tl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(<vscale x 8 x i1> [[TMP0]], <vscale x 2 x i64> [[BASE:%.*]], i64 [[OFFSET:%.*]])
+// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svld1q_gather_u64base_offset_f16(svbool_t pg, svuint64_t base, int64_t offset) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_f16,)(pg, base, offset); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_offset_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z38test_svld1q_gather_u64base_offset_bf16u10__SVBool_tu12__SVUint64_tl( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svld1q_gather_u64base_offset_bf16(svbool_t pg, svuint64_t base, int64_t offset) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_offset_bf16,)(pg, base, offset); +} + +// Vector base and no offset +// CHECK-LABEL: @test_svld1q_gather_u64base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svld1q_gather_u64base_u64(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u64,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svld1q_gather_u64base_s64(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s64,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u32u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svld1q_gather_u64base_u32(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u32,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s32u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svld1q_gather_u64base_s32(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s32,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_u16u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svld1q_gather_u64base_u16(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u16,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_s16u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svld1q_gather_u64base_s16(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s16,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64( [[PG:%.*]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z29test_svld1q_gather_u64base_u8u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64( [[PG:%.*]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svld1q_gather_u64base_u8(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_u8,)(pg, 
base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64( [[PG:%.*]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z29test_svld1q_gather_u64base_s8u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64( [[PG:%.*]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svld1q_gather_u64base_s8(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_s8,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f64u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat64_t test_svld1q_gather_u64base_f64(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f64,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f32u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat32_t test_svld1q_gather_u64base_f32(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f32,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z30test_svld1q_gather_u64base_f16u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svfloat16_t test_svld1q_gather_u64base_f16(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_f16,)(pg, base); +} + +// CHECK-LABEL: @test_svld1q_gather_u64base_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z31test_svld1q_gather_u64base_bf16u10__SVBool_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64( [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svld1q_gather_u64base_bf16(svbool_t pg, svuint64_t base) +{ + return SVE_ACLE_FUNC(svld1q_gather,_u64base,_bf16,)(pg, base); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c @@ -0,0 +1,353 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -DIGNORE_STREAMING_ATTR -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) 
+#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svpext_lane_c8_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpext_lane_c8_0(svcount_t c) { + return svpext_lane_c8(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svpext_lane_c8_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpext_lane_c8_3(svcount_t c) { + return svpext_lane_c8(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c16_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c16_0(svcount_t c) { + return svpext_lane_c16(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c16_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c16_3(svcount_t c) { + return svpext_lane_c16(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c32_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c32_0(svcount_t c) { + return svpext_lane_c32(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_3( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c32_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c32_3(svcount_t c) { + return svpext_lane_c32(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c64_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c64_0(svcount_t c) { + return svpext_lane_c64(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z22test_svpext_lane_c64_3u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpext_lane_c64_3(svcount_t c) { + return svpext_lane_c64(c, 3); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svpext_lane_c8_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svpext_lane_c8_x2_0(svcount_t c) { + return svpext_lane_c8_x2(c, 0); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c8_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z24test_svpext_lane_c8_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svpext_lane_c8_x2_1(svcount_t c) { + return svpext_lane_c8_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c16_x2_0(svcount_t c) { + return svpext_lane_c16_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c16_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call 
@llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c16_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c16_x2_1(svcount_t c) { + return svpext_lane_c16_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c32_x2_0(svcount_t c) { + return svpext_lane_c32_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c32_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// 
CPP-CHECK-LABEL: @_Z25test_svpext_lane_c32_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c32_x2_1(svcount_t c) { + return svpext_lane_c32_x2(c, 1); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_x2_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_0u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c64_x2_0(svcount_t c) { + return svpext_lane_c64_x2(c, 0); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svpext_lane_c64_x2_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z25test_svpext_lane_c64_x2_1u11__SVCount_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } 
@llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") [[C:%.*]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svpext_lane_c64_x2_1(svcount_t c) { + return svpext_lane_c64_x2(c, 1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c @@ -0,0 +1,32 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svpfalse_c( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svpfalse_cv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( zeroinitializer) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svpfalse_c() +{ + return SVE_ACLE_FUNC(svpfalse_c,,,)(); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c @@ -0,0 +1,162 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1, A2) A1##A2 +#endif + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u8 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( [[ZN]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z19test_svpmov_lane_u8u11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( [[ZN]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpmov_lane_u8(svuint8_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _u8)(zn, 0); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s8 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( [[ZN]], i32 0) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z19test_svpmov_lane_s8u10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( [[ZN]], i32 0) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpmov_lane_s8(svint8_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _s8)(zn, 0); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_u16u12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_u16(svuint16_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _u16)(zn, 0); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( [[ZN]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_s16u11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( [[ZN]], i32 1) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_s16(svint16_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _s16)(zn, 1); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_u32u12__SVUint32_t +// CPP-CHECK-SAME: ( 
[[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_u32(svuint32_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _u32)(zn, 0); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s32 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( [[ZN]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_s32u11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( [[ZN]], i32 3) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_s32(svint32_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _s32)(zn, 3); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u64 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( [[ZN]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_u64u12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( [[ZN]], i32 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_u64(svuint64_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _u64)(zn, 0); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s64 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( [[ZN]], i32 7) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z20test_svpmov_lane_s64u11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( [[ZN]], i32 7) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpmov_lane_s64(svint64_t zn) { + return SVE_ACLE_FUNC(svpmov_lane, _s64)(zn, 7); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c @@ -0,0 +1,275 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3 +#endif + +// _m + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u16_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( [[ZN]], [[TMP0]], i32 1) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u16_mu12__SVUint16_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( [[ZN]], [[TMP0]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svpmov_lane_u16_m(svuint16_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _u16, _m)(zn, pn, 1); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s16_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( [[ZN]], [[TMP0]], i32 1) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s16_mu11__SVInt16_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( [[ZN]], [[TMP0]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svpmov_lane_s16_m(svint16_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _s16, _m)(zn, pn, 1); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u32_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( [[ZN]], [[TMP0]], i32 1) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u32_mu12__SVUint32_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( [[ZN]], [[TMP0]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t test_svpmov_lane_u32_m(svuint32_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _u32, _m)(zn, pn, 1); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s32_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( [[ZN]], [[TMP0]], i32 3) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s32_mu11__SVInt32_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( [[ZN]], [[TMP0]], i32 3) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svpmov_lane_s32_m(svint32_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _s32, _m)(zn, pn, 3); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u64_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( [[ZN]], [[TMP0]], i32 1) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u64_mu12__SVUint64_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( [[ZN]], [[TMP0]], i32 1) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svpmov_lane_u64_m(svuint64_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _u64, _m)(zn, pn, 1); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s64_m +// CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( [[ZN]], [[TMP0]], i32 7) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s64_mu11__SVInt64_tu10__SVBool_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( [[ZN]], [[TMP0]], i32 7) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svpmov_lane_s64_m(svint64_t zn, svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane, _s64, _m)(zn, pn, 7); +} + + +// _z + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u8_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( [[PN]]) +// CHECK-NEXT: ret [[TMP0]] +// +// 
CPP-CHECK-LABEL: define dso_local @_Z21test_svpmov_lane_u8_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( [[PN]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svpmov_lane_u8_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_u8, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s8_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( [[PN]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z21test_svpmov_lane_s8_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( [[PN]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svpmov_lane_s8_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_s8, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u16_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u16_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svpmov_lane_u16_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_u16, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s16_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s16_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint16_t test_svpmov_lane_s16_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_s16, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u32_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u32_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint32_t 
test_svpmov_lane_u32_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_u32, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s32_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s32_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint32_t test_svpmov_lane_s32_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_s32, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_u64_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_u64_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint64_t test_svpmov_lane_u64_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_u64, , _z)(pn); +} + +// CHECK-LABEL: define dso_local @test_svpmov_lane_s64_z +// CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: define dso_local @_Z22test_svpmov_lane_s64_zu10__SVBool_t +// CPP-CHECK-SAME: ( [[PN:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svint64_t test_svpmov_lane_s64_z(svbool_t pn) { + return SVE_ACLE_FUNC(svpmov_lane_s64, , _z)(pn); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svpsel_lane_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_b8u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[TMP0]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b8(p1, p2, idx, 15); +} + +// CHECK-LABEL: @test_svpsel_lane_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b16u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b16(p1, p2, idx, 7); +} + +// CHECK-LABEL: @test_svpsel_lane_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b32u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b32(p1, p2, idx, 3); +} + +// CHECK-LABEL: @test_svpsel_lane_b64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b64u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) { + return 
svpsel_lane_b64(p1, p2, idx, 1); +} + +// CHECK-LABEL: @test_svpsel_lane_c8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_c8u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c8(p1, p2, idx, 15); +} + +// CHECK-LABEL: @test_svpsel_lane_c16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c16u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c16(p1, p2, idx, 7); +} + +// CHECK-LABEL: @test_svpsel_lane_c32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c32u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: 
entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c32(p1, p2, idx, 3); +} + +// CHECK-LABEL: @test_svpsel_lane_c64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c64u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[TMP2]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP3]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP4]] +// +svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c64(p1, p2, idx, 1); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c @@ -1,10 +1,18 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S 
-DIGNORE_STREAMING_ATTR -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) +#endif + +__attribute__((arm_streaming)) // CHECK-LABEL: @test_svptrue_c8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() @@ -19,6 +27,7 @@ return svptrue_c8(); } +__attribute__((arm_streaming)) // CHECK-LABEL: @test_svptrue_c16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() @@ -33,6 +42,7 @@ return svptrue_c16(); } +__attribute__((arm_streaming)) // CHECK-LABEL: @test_svptrue_c32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c32() @@ -47,6 +57,7 @@ return svptrue_c32(); } +__attribute__((arm_streaming)) // CHECK-LABEL: @test_svptrue_c64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c64() diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c @@ -0,0 +1,81 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + + +// SQRSHRN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrn_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_s16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svint16_t test_svqrshrn_s16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_s16_x2,,,)(zn, 16); +} + +// UQRSHRN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrn_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svqrshrn_u16_x212svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.uqrshrn.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svqrshrn_u16_x2(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svqrshrn,_u16_x2,,,)(zn, 16); +} + +// SQRSHRUN x 2 + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svqrshrun_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z21test_svqrshrun_u16_x211svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.sqrshrun.x2.nxv4i32( [[TMP0]], [[TMP1]], i32 16) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svuint16_t test_svqrshrun_u16_x2(svint32x2_t zn) { + return SVE_ACLE_FUNC(svsqrshrun,_u16_x2,,,)(zn, 16); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx2.c @@ -0,0 +1,396 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_s8_x2u11__SVCount_t10svint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x2_t test_svsel_s8_x2(svcount_t pn, svint8x2_t zn, svint8x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s8_x2)(pn, zn, zm); +} + 
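(A minimal sketch, illustrative only and not part of the patch, of how the two-argument SVE_ACLE_FUNC macro defined at the top of this test picks the intrinsic name used in calls such as SVE_ACLE_FUNC(svsel,_s8_x2)(pn, zn, zm): with -DSVE_OVERLOADED_FORMS only the overloaded base name is kept, otherwise the type suffix is token-pasted onto it.)

/* Illustrative expansion only; mirrors the two-argument macro above. */
#ifdef SVE_OVERLOADED_FORMS
#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1      /* overloaded form: svsel(...) */
#else
#define SVE_ACLE_FUNC(A1, A2) A1##A2         /* suffixed form: svsel_s8_x2(...) */
#endif

/* SVE_ACLE_FUNC(svsel, _s8_x2)(pn, zn, zm)
 *   expands to svsel(pn, zn, zm)        when SVE_OVERLOADED_FORMS is defined
 *   expands to svsel_s8_x2(pn, zn, zm)  otherwise
 */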
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_u8_x2u11__SVCount_t11svuint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x2_t test_svsel_u8_x2(svcount_t pn, svuint8x2_t zn, svuint8x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u8_x2)(pn, zn, zm); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s16_x2u11__SVCount_t11svint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x2_t test_svsel_s16_x2(svcount_t pn, svint16x2_t zn, svint16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u16_x2u11__SVCount_t12svuint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x2_t test_svsel_u16_x2(svcount_t pn, svuint16x2_t zn, svuint16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 
8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f16_x2u11__SVCount_t13svfloat16x2_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x2_t test_svsel_f16_x2(svcount_t pn, svfloat16x2_t zn, svfloat16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f16_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x2u11__SVCount_t14svbfloat16x2_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x2_t test_svsel_bf16_x2(svcount_t pn, svbfloat16x2_t zn, svbfloat16x2_t zm) { + return SVE_ACLE_FUNC(svsel,_bf16_x2)(pn, zn, zm); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s32_x2u11__SVCount_t11svint32x2_t11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x2_t test_svsel_s32_x2(svcount_t pn, svint32x2_t zn, svint32x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s32_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u32_x2u11__SVCount_t12svuint32x2_t12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x2_t test_svsel_u32_x2(svcount_t pn, svuint32x2_t zn, svuint32x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u32_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f32_x2u11__SVCount_t13svfloat32x2_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x2_t test_svsel_f32_x2(svcount_t pn, svfloat32x2_t zn, svfloat32x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f32_x2)(pn, zn, zm); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svsel_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s64_x2u11__SVCount_t11svint64x2_t11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x2_t test_svsel_s64_x2(svcount_t pn, svint64x2_t zn, svint64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_s64_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u64_x2u11__SVCount_t12svuint64x2_t12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( 
[[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x2_t test_svsel_u64_x2(svcount_t pn, svuint64x2_t zn, svuint64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_u64_x2)(pn, zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f64_x2u11__SVCount_t13svfloat64x2_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.sel.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP6]], [[TMP7]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x2_t test_svsel_f64_x2(svcount_t pn, svfloat64x2_t zn, svfloat64x2_t zm) { + return SVE_ACLE_FUNC(svsel,_f64_x2)(pn, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_selx4.c @@ -0,0 +1,588 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S 
-passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_s8_x4u11__SVCount_t10svint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint8x4_t test_svsel_s8_x4(svcount_t pn, svint8x4_t zn1, svint8x4_t zn2) { + return SVE_ACLE_FUNC(svsel,_s8_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z16test_svsel_u8_x4u11__SVCount_t11svuint8x4_t11svuint8x4_t11svuint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN1]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZN2]], i64 48) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP10]], [[TMP11]], i64 16) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP12]], [[TMP13]], i64 32) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP14]], [[TMP15]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint8x4_t test_svsel_u8_x4(svcount_t pn, svuint8x4_t zn1, svuint8x4_t zn2, svuint8x4_t zn3, svuint8x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u8_x4)(pn, zn1, zn2); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s16_x4u11__SVCount_t11svint16x4_t11svint16x4_t11svint16x4_t11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint16x4_t test_svsel_s16_x4(svcount_t pn, svint16x4_t zn1, svint16x4_t zn2, svint16x4_t zn3, svint16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail 
call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u16_x4u11__SVCount_t12svuint16x4_t12svuint16x4_t12svuint16x4_t12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint16x4_t test_svsel_u16_x4(svcount_t pn, svuint16x4_t zn1, svuint16x4_t zn2, svuint16x4_t zn3, svuint16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") 
[[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f16_x4u11__SVCount_t13svfloat16x4_t13svfloat16x4_t13svfloat16x4_t13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat16x4_t test_svsel_f16_x4(svcount_t pn, svfloat16x4_t zn1, svfloat16x4_t zn2, svfloat16x4_t zn3, svfloat16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f16_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 24) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 8) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP12]], [[TMP13]], i64 16) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP14]], [[TMP15]], i64 24) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z18test_svsel_bf16_x4u11__SVCount_t14svbfloat16x4_t14svbfloat16x4_t14svbfloat16x4_t14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN1]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN2]], i64 24) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv8bf16(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP10]], [[TMP11]], i64 8) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP12]], [[TMP13]], i64 16) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP14]], [[TMP15]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svbfloat16x4_t test_svsel_bf16_x4(svcount_t pn, svbfloat16x4_t zn1, svbfloat16x4_t zn2, svbfloat16x4_t zn3, svbfloat16x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_bf16_x4)(pn, zn1, zn2); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s32_x4u11__SVCount_t11svint32x4_t11svint32x4_t11svint32x4_t11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 
12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint32x4_t test_svsel_s32_x4(svcount_t pn, svint32x4_t zn1, svint32x4_t zn2, svint32x4_t zn3, svint32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s32_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u32_x4u11__SVCount_t12svuint32x4_t12svuint32x4_t12svuint32x4_t12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP11]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint32x4_t test_svsel_u32_x4(svcount_t pn, svuint32x4_t zn1, svuint32x4_t zn2, svuint32x4_t zn3, svuint32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u32_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 12) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f32_x4u11__SVCount_t13svfloat32x4_t13svfloat32x4_t13svfloat32x4_t13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN1]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN2]], i64 12) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// 
CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP10]], [[TMP11]], i64 4) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP12]], [[TMP13]], i64 8) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP14]], [[TMP15]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat32x4_t test_svsel_f32_x4(svcount_t pn, svfloat32x4_t zn1, svfloat32x4_t zn2, svfloat32x4_t zn3, svfloat32x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f32_x4)(pn, zn1, zn2); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_s64_x4u11__SVCount_t11svint64x4_t11svint64x4_t11svint64x4_t11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svint64x4_t test_svsel_s64_x4(svcount_t pn, svint64x4_t zn1, svint64x4_t zn2, svint64x4_t zn3, svint64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_s64_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_u64_x4u11__SVCount_t12svuint64x4_t12svuint64x4_t12svuint64x4_t12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svuint64x4_t test_svsel_u64_x4(svcount_t pn, svuint64x4_t zn1, svuint64x4_t zn2, svuint64x4_t zn3, svuint64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_u64_x4)(pn, zn1, zn2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svsel_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 6) +// CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CHECK-NEXT: ret [[TMP16]] +// +// CPP-CHECK-LABEL: @_Z17test_svsel_f64_x4u11__SVCount_t13svfloat64x4_t13svfloat64x4_t13svfloat64x4_t13svfloat64x4_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN1]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZN2]], i64 6) +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call { , , , } @llvm.aarch64.sve.sel.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP8]], 0 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP9]], i64 0) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP8]], 1 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP10]], [[TMP11]], i64 2) +// CPP-CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP8]], 2 +// CPP-CHECK-NEXT: [[TMP14:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP12]], [[TMP13]], i64 4) +// CPP-CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[TMP8]], 3 +// CPP-CHECK-NEXT: [[TMP16:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP14]], [[TMP15]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP16]] +// +svfloat64x4_t test_svsel_f64_x4(svcount_t pn, svfloat64x4_t zn1, svfloat64x4_t zn2, svfloat64x4_t zn3, svfloat64x4_t zn4) { + return SVE_ACLE_FUNC(svsel,_f64_x4)(pn, zn1, zn2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c @@ -0,0 +1,1050 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -DIGNORE_STREAMING_ATTR -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -DIGNORE_STREAMING_ATTR -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include <arm_sve.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_u8_x2u11__SVCount_tPh11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u16_x2u11__SVCount_tPt12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u32_x2u11__SVCount_tPj12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u64_x2u11__SVCount_tPm12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_u64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_u8_x4u11__SVCount_tPh11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u16_x4u11__SVCount_tPt12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u32_x4u11__SVCount_tPj12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_u64_x4u11__SVCount_tPm12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_u64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], 
i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_s8_x2u11__SVCount_tPa10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s16_x2u11__SVCount_tPs11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s32_x2u11__SVCount_tPi11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s64_x2u11__SVCount_tPl11svint64x2_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_s64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst1_s8_x4u11__SVCount_tPa10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s16_x4u11__SVCount_tPs11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// 
CHECK-LABEL: @test_svst1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s32_x4u11__SVCount_tPi11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_s64_x4u11__SVCount_tPl11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_s64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f16_x2u11__SVCount_tPDh13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f32_x2u11__SVCount_tPf13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f64_x2u11__SVCount_tPd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svst1,_f64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f16_x4u11__SVCount_tPDh13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_f16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f32_x4u11__SVCount_tPf13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svst1,_f32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svst1_f64_x4u11__SVCount_tPd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +{ + return 
SVE_ACLE_FUNC(svst1,_f64_x4,,)(pn, base, v); +} + + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// 
CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_u64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// 
CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_s64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svst1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) +{ + return SVE_ACLE_FUNC(svst1_vnum,_f64_x4,,)(pn, base, vnum, v); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c @@ -0,0 +1,255 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
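+// For example, the overloaded form defined just below drops the unused arguments, so
+// SVE_ACLE_FUNC(svst1,_u8_x2,,) expands to the overloaded name svst1; the non-overloaded
+// form pastes all four arguments together, giving svst1_u8_x2.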
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// ST1W + +// CHECK-LABEL: define dso_local void @test_svst1uwq_u32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_u32u10__SVBool_tPKju12__SVUint32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) { + SVE_ACLE_FUNC(svst1uwq, _u32, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_u32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_u32u10__SVBool_tPKju12__SVUint32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_vnum_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) { + SVE_ACLE_FUNC(svst1uwq_vnum, _u32, , )(pred, base, 1, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1uwq_s32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_s32u10__SVBool_tPKiu11__SVInt32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_s32(svbool_t pred, int32_t const * base, svint32_t zt) { + SVE_ACLE_FUNC(svst1uwq, _s32, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_s32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 
1 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_s32u10__SVBool_tPKiu11__SVInt32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4i32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_vnum_s32(svbool_t pred, int32_t const * base, svint32_t zt) { + SVE_ACLE_FUNC(svst1uwq_vnum, _s32, , )(pred, base, 1, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1uwq_f32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1uwq_f32u10__SVBool_tPKfu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) { + SVE_ACLE_FUNC(svst1uwq, _f32, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1uwq_vnum_f32 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 1 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1uwq_vnum_f32u10__SVBool_tPKfu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 1 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1uwq.nxv4f32( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1uwq_vnum_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) { + SVE_ACLE_FUNC(svst1uwq_vnum, _f32, , )(pred, base, 1, zt); +} + + +// ST1D + +// CHECK-LABEL: define dso_local void @test_svst1udq_u64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_u64u10__SVBool_tPKmu12__SVUint64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail 
call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) { + SVE_ACLE_FUNC(svst1udq, _u64, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_u64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_u64u10__SVBool_tPKmu12__SVUint64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_vnum_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) { + SVE_ACLE_FUNC(svst1udq_vnum, _u64, , )(pred, base, -8, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1udq_s64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_s64u10__SVBool_tPKlu11__SVInt64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_s64(svbool_t pred, int64_t const * base, svint64_t zt) { + SVE_ACLE_FUNC(svst1udq, _s64, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_s64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_s64u10__SVBool_tPKlu11__SVInt64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2i64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_vnum_s64(svbool_t pred, int64_t const * base, svint64_t zt) { + SVE_ACLE_FUNC(svst1udq_vnum, _s64, , )(pred, base, -8, 
zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1udq_f64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z17test_svst1udq_f64u10__SVBool_tPKdu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64( [[ZT]], [[TMP0]], ptr [[BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) { + SVE_ACLE_FUNC(svst1udq, _f64, , )(pred, base, zt); +} + +// CHECK-LABEL: define dso_local void @test_svst1udq_vnum_f64 +// CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z22test_svst1udq_vnum_f64u10__SVBool_tPKdu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[PRED:%.*]], ptr noundef [[BASE:%.*]], [[ZT:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PRED]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE]], i64 -8 +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1udq.nxv2f64( [[ZT]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1udq_vnum_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) { + SVE_ACLE_FUNC(svst1udq_vnum, _f64, , )(pred, base, -8, zt); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -0,0 +1,1049 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -DIGNORE_STREAMING_ATTR -disable-O0-optnone -Werror -Wall -o 
/dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_u8_x2u11__SVCount_tPh11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u16_x2u11__SVCount_tPt12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u32_x2u11__SVCount_tPj12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u32_x2,,)(pn, 
base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u64_x2u11__SVCount_tPm12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_u8_x4u11__SVCount_tPh11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u16_x4u11__SVCount_tPt12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( 
[[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u32_x4u11__SVCount_tPj12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_u64_x4u11__SVCount_tPm12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_u64_x4,,)(pn, base, 
v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_s8_x2u11__SVCount_tPa10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s8_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s16_x2u11__SVCount_tPs11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s32_x2u11__SVCount_tPi11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s64_x2u11__SVCount_tPl11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svstnt1_s8_x4u11__SVCount_tPa10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s8_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s16_x4u11__SVCount_tPs11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], 
target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s32_x4u11__SVCount_tPi11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_s64_x4u11__SVCount_tPl11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_s64_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.stnt1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f16_x2u11__SVCount_tPDh13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f16_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f32_x2u11__SVCount_tPf13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f32_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f64_x2u11__SVCount_tPd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f64_x2,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") 
[[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f16_x4u11__SVCount_tPDh13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f16_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f32_x4u11__SVCount_tPf13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f32_x4,,)(pn, base, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svstnt1_f64_x4u11__SVCount_tPd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v); +} + +// == VNUM variants == + + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z24test_svstnt1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +{ + return 
SVE_ACLE_FUNC(svstnt1_vnum,_u32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) 
+// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[V]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], 
i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], 
target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64( [[TMP0]], [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x2,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[V]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[V]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.stnt1.pn.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x4,,)(pn, base, vnum, v); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svstnt1_vnum_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[V]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[CONV]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x4,,)(pn, base, vnum, v); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c @@ -0,0 +1,2124 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s 
+#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svst2q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst2q_u8u10__SVBool_tPKh11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_u8(svbool_t pg, const uint8_t *base, svuint8x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_u8,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst2q_s8u10__SVBool_tPKa10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_s8(svbool_t pg, const int8_t *base, svint8x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_s8,)(pg, base, zt); +} +// CHECK-LABEL: @test_svst2q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_u16u10__SVBool_tPKt12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_u16(svbool_t pg, const uint16_t *base, svuint16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_u16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_s16u10__SVBool_tPKs11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_s16(svbool_t pg, const int16_t *base, svint16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_s16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_u32u10__SVBool_tPKj12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_u32(svbool_t pg, const uint32_t *base, svuint32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_u32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_s32u10__SVBool_tPKi11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_s32(svbool_t pg, const int32_t *base, svint32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_s32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_u64u10__SVBool_tPKm12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_u64(svbool_t pg, const uint64_t *base, svuint64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_u64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_s64u10__SVBool_tPKl11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_s64(svbool_t pg, const int64_t *base, svint64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_s64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_f16u10__SVBool_tPKDh13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_f16(svbool_t pg, const float16_t *base, svfloat16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_f16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst2q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_bf16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_f32u10__SVBool_tPKf13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_f32(svbool_t pg, const float32_t *base, svfloat32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_f32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2q_f64u10__SVBool_tPKd13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_f64(svbool_t pg, const float64_t *base, svfloat64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q,,_f64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_u8u10__SVBool_tPKhl11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum_,,u8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_s8u10__SVBool_tPKal10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_s8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u16u10__SVBool_tPKtl12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_u16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: 
@test_svst2q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s16u10__SVBool_tPKsl11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_s16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u32u10__SVBool_tPKjl12svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_u32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s32u10__SVBool_tPKil11svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_s32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u64u10__SVBool_tPKml12svuint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_u64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s64u10__SVBool_tPKll11svint64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_s64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT]], 
i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f16u10__SVBool_tPKDhl13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_f16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst2q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_bf16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f32u10__SVBool_tPKfl13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_f32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst2q_vnum_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f64u10__SVBool_tPKdl13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x2_t zt) +{ + return SVE_ACLE_FUNC(svst2q_vnum,,_f64,)(pg, base, vnum, zt); +} + +// +// ST3Q +// CHECK-LABEL: @test_svst3q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst3q_u8u10__SVBool_tPKh11svuint8x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_u8(svbool_t pg, const uint8_t *base, svuint8x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_u8,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst3q_s8u10__SVBool_tPKa10svint8x3_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_s8(svbool_t pg, const int8_t *base, svint8x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_s8,)(pg, base, zt); +} +// CHECK-LABEL: @test_svst3q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_u16u10__SVBool_tPKt12svuint16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_u16(svbool_t pg, const uint16_t *base, svuint16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_u16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_s16u10__SVBool_tPKs11svint16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_s16(svbool_t pg, const int16_t *base, svint16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_s16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_u32u10__SVBool_tPKj12svuint32x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_u32(svbool_t pg, const uint32_t *base, svuint32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_u32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_s32u10__SVBool_tPKi11svint32x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_s32(svbool_t pg, const int32_t *base, svint32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_s32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_u64u10__SVBool_tPKm12svuint64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_u64(svbool_t pg, const uint64_t *base, svuint64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_u64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_s64u10__SVBool_tPKl11svint64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_s64(svbool_t pg, const int64_t *base, svint64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_s64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_f16u10__SVBool_tPKDh13svfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_f16(svbool_t pg, const float16_t *base, svfloat16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_f16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst3q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_bf16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_f32u10__SVBool_tPKf13svfloat32x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_f32(svbool_t pg, const float32_t *base, svfloat32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_f32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3q_f64u10__SVBool_tPKd13svfloat64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q,,_f64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_u8u10__SVBool_tPKhl11svuint8x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum_,,u8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_s8u10__SVBool_tPKal10svint8x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_s8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u16u10__SVBool_tPKtl12svuint16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_u16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s16u10__SVBool_tPKsl11svint16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_s16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u32u10__SVBool_tPKjl12svuint32x3_t( 
+// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_u32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s32u10__SVBool_tPKil11svint32x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_s32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u64u10__SVBool_tPKml12svuint64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_u64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s64u10__SVBool_tPKll11svint64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_s64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f16u10__SVBool_tPKDhl13svfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t 
vnum, svfloat16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_f16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst3q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_bf16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f32u10__SVBool_tPKfl13svfloat32x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_f32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst3q_vnum_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f64u10__SVBool_tPKdl13svfloat64x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x3_t zt) +{ + return SVE_ACLE_FUNC(svst3q_vnum,,_f64,)(pg, base, vnum, zt); +} + +// +// ST4Q +// CHECK-LABEL: @test_svst4q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst4q_u8u10__SVBool_tPKh11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_u8(svbool_t pg, const uint8_t *base, svuint8x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_u8,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z14test_svst4q_s8u10__SVBool_tPKa10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_s8(svbool_t pg, const int8_t *base, svint8x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_s8,)(pg, base, zt); +} +// CHECK-LABEL: @test_svst4q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_u16u10__SVBool_tPKt12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_u16(svbool_t pg, const uint16_t *base, svuint16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_u16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_s16u10__SVBool_tPKs11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_s16(svbool_t pg, const int16_t *base, svint16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_s16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_u32u10__SVBool_tPKj12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_u32(svbool_t pg, const uint32_t *base, svuint32x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_u32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_s32u10__SVBool_tPKi11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_s32(svbool_t pg, const int32_t *base, svint32x4_t zt) +{ + return 
SVE_ACLE_FUNC(svst4q,,_s32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_u64u10__SVBool_tPKm12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_u64(svbool_t pg, const uint64_t *base, svuint64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_u64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_s64u10__SVBool_tPKl11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_s64(svbool_t pg, const int64_t *base, svint64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_s64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_f16u10__SVBool_tPKDh13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_f16(svbool_t pg, const float16_t *base, svfloat16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_f16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svst4q_bf16u10__SVBool_tPKu6__bf1614svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_bf16(svbool_t pg, const bfloat16_t *base, svbfloat16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_bf16,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_f32u10__SVBool_tPKf13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_f32(svbool_t pg, const float32_t *base, svfloat32x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_f32,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4q_f64u10__SVBool_tPKd13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q,,_f64,)(pg, base, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_u8u10__SVBool_tPKhl11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum_,,u8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_s8u10__SVBool_tPKal10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_s8,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u16u10__SVBool_tPKtl12svuint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_u16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s16u10__SVBool_tPKsl11svint16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_s16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u32u10__SVBool_tPKjl12svuint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_u32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s32u10__SVBool_tPKil11svint32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_s32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u64u10__SVBool_tPKml12svuint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( 
[[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_u64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s64u10__SVBool_tPKll11svint64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_s64,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f16u10__SVBool_tPKDhl13svfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_f16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svst4q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZT]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_bf16,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f32u10__SVBool_tPKfl13svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZT]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_f32,)(pg, base, vnum, zt); +} + +// CHECK-LABEL: @test_svst4q_vnum_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f64u10__SVBool_tPKdl13svfloat64x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[ZT]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x4_t zt) +{ + return SVE_ACLE_FUNC(svst4q_vnum,,_f64,)(pg, base, vnum, zt); +} + +// Scatter for 128 bits +// vector base + scalar offset +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u64u10__SVBool_tu12__SVUint64_tlu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: 
ret void +// +void test_svst1q_scatter_u64base_offset_u64(svbool_t pg, svuint64_t base, int64_t offset, svuint64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u64)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s64u10__SVBool_tu12__SVUint64_tlu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_s64(svbool_t pg, svuint64_t base, int64_t offset, svint64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s64)(pg, base, offset, data); +} + + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u32u10__SVBool_tu12__SVUint64_tlu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_u32(svbool_t pg, svuint64_t base, int64_t offset, svuint32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u32)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s32u10__SVBool_tu12__SVUint64_tlu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_s32(svbool_t pg, svuint64_t base, int64_t offset, svint32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s32)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// 
+// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_u16u10__SVBool_tu12__SVUint64_tlu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_u16(svbool_t pg, svuint64_t base, int64_t offset, svuint16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u16)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_s16u10__SVBool_tu12__SVUint64_tlu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_s16(svbool_t pg, svuint64_t base, int64_t offset, svint16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s16)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_offset_u8u10__SVBool_tu12__SVUint64_tlu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_u8(svbool_t pg, svuint64_t base, int64_t offset, svuint8_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _u8)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z37test_svst1q_scatter_u64base_offset_s8u10__SVBool_tu12__SVUint64_tlu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_s8(svbool_t pg, svuint64_t base, int64_t offset, svint8_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _s8)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret 
void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f64u10__SVBool_tu12__SVUint64_tlu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_f64(svbool_t pg, svuint64_t base, int64_t offset, svfloat64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f64)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f32u10__SVBool_tu12__SVUint64_tlu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_f32(svbool_t pg, svuint64_t base, int64_t offset, svfloat32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f32)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z38test_svst1q_scatter_u64base_offset_f16u10__SVBool_tu12__SVUint64_tlu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_f16(svbool_t pg, svuint64_t base, int64_t offset, svfloat16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _f16)(pg, base, offset, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_offset_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z39test_svst1q_scatter_u64base_offset_bf16u10__SVBool_tu12__SVUint64_tlu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 [[OFFSET:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_offset_bf16(svbool_t pg, svuint64_t base, int64_t offset, svbfloat16_t data) +{ + return 
SVE_ACLE_FUNC(svst1q_scatter, _u64base, _offset, _bf16)(pg, base, offset, data); +} + +// Vector Base and no Offset +// CHECK-LABEL: @test_svst1q_scatter_u64base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u64u10__SVBool_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_u64(svbool_t pg, svuint64_t base, svuint64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u64)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s64u10__SVBool_tu12__SVUint64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_s64(svbool_t pg, svuint64_t base, svint64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s64)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u32u10__SVBool_tu12__SVUint64_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_u32(svbool_t pg, svuint64_t base, svuint32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u32)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s32u10__SVBool_tu12__SVUint64_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( [[DATA:%.*]], [[TMP0]], 
[[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_s32(svbool_t pg, svuint64_t base, svint32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s32)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_u16u10__SVBool_tu12__SVUint64_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_u16(svbool_t pg, svuint64_t base, svuint16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u16)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_s16u10__SVBool_tu12__SVUint64_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_s16(svbool_t pg, svuint64_t base, svint16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s16)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svst1q_scatter_u64base_u8u10__SVBool_tu12__SVUint64_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_u8(svbool_t pg, svuint64_t base, svuint8_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _u8)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svst1q_scatter_u64base_s8u10__SVBool_tu12__SVUint64_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( [[DATA:%.*]], [[PG:%.*]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_s8(svbool_t pg, svuint64_t base, svint8_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _s8)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f64u10__SVBool_tu12__SVUint64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_f64(svbool_t pg, svuint64_t base, svfloat64_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f64)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f32u10__SVBool_tu12__SVUint64_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_f32(svbool_t pg, svuint64_t base, svfloat32_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f32)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svst1q_scatter_u64base_f16u10__SVBool_tu12__SVUint64_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_f16(svbool_t pg, svuint64_t base, svfloat16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, _u64base,, _f16)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1q_scatter_u64base_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svst1q_scatter_u64base_bf16u10__SVBool_tu12__SVUint64_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64( [[DATA:%.*]], [[TMP0]], [[BASE:%.*]], i64 0) +// CPP-CHECK-NEXT: ret void +// +void test_svst1q_scatter_u64base_bf16(svbool_t pg, svuint64_t base, svbfloat16_t data) +{ + return SVE_ACLE_FUNC(svst1q_scatter, 
_u64base,,_bf16)(pg, base, data); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svtblq_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z14test_svtblq_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svtblq_u8(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtblq, _u8,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svtblq_u16(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtblq, _u16,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.tblq.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svtblq_u32(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtblq, _u32,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svtblq_u64(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtblq, _u64,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z14test_svtblq_s8u10__SVInt8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svtblq_s8(svint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtblq, _s8,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_s16u11__SVInt16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svtblq_s16(svint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtblq, _s16,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_s32u11__SVInt32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svtblq_s32(svint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtblq, _s32,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_s64u11__SVInt64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svtblq_s64(svint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtblq, _s64,,)(zn, zm); +} + +// CHECK-LABEL: define 
dso_local @test_svtblq_f16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8f16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_f16u13__SVFloat16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8f16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svtblq_f16(svfloat16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtblq, _f16,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv4f32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_f32u13__SVFloat32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv4f32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svtblq_f32(svfloat32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtblq, _f32,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_f64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2f64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtblq_f64u13__SVFloat64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv2f64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svtblq_f64(svfloat64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtblq, _f64,,)(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtblq_bf16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8bf16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svtblq_bf16u14__SVBFloat16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tblq.nxv8bf16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtblq_bf16(svbfloat16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtblq, _bf16,,)(zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svtbxq_u8 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv16i8( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z14test_svtbxq_u8u11__SVUint8_tu11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv16i8( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svtbxq_u8(svuint8_t passthru, svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtbxq, _u8,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_u16 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8i16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_u16u12__SVUint16_tu12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8i16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svtbxq_u16(svuint16_t passthru, svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtbxq, _u16,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_u32 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4i32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_u32u12__SVUint32_tu12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4i32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svtbxq_u32(svuint32_t passthru, svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtbxq, _u32,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_u64 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2i64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_u64u12__SVUint64_tu12__SVUint64_tu12__SVUint64_t +// 
CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2i64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svtbxq_u64(svuint64_t passthru, svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtbxq, _u64,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_s8 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv16i8( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z14test_svtbxq_s8u10__SVInt8_tu10__SVInt8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv16i8( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svtbxq_s8(svint8_t passthru, svint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svtbxq, _s8,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_s16 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8i16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_s16u11__SVInt16_tu11__SVInt16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8i16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svtbxq_s16(svint16_t passthru, svint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtbxq, _s16,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_s32 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4i32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_s32u11__SVInt32_tu11__SVInt32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4i32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svtbxq_s32(svint32_t passthru, svint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtbxq, _s32,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_s64 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2i64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_s64u11__SVInt64_tu11__SVInt64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2i64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svtbxq_s64(svint64_t passthru, svint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtbxq, _s64,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_f16 +// CHECK-SAME: ( 
[[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8f16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_f16u13__SVFloat16_tu13__SVFloat16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8f16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svtbxq_f16(svfloat16_t passthru, svfloat16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtbxq, _f16,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_f32 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4f32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_f32u13__SVFloat32_tu13__SVFloat32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv4f32( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svtbxq_f32(svfloat32_t passthru, svfloat32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svtbxq, _f32,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_f64 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2f64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svtbxq_f64u13__SVFloat64_tu13__SVFloat64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv2f64( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svtbxq_f64(svfloat64_t passthru, svfloat64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svtbxq, _f64,,)(passthru, zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svtbxq_bf16 +// CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8bf16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svtbxq_bf16u14__SVBFloat16_tu14__SVBFloat16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[PASSTHRU:%.*]], [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbxq.nxv8bf16( [[PASSTHRU]], [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtbxq_bf16(svbfloat16_t passthru, svbfloat16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svtbxq, _bf16,,)(passthru, zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx2.c @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x2u10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv8i16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svunpk_s16_x2(svint8_t zn) { + return SVE_ACLE_FUNC(svunpk,_s16_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x2u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv8i16( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( 
[[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svunpk_u16_x2(svuint8_t zn) { + return SVE_ACLE_FUNC(svunpk,_u16_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x2u11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv4i32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svunpk_s32_x2(svint16_t zn) { + return SVE_ACLE_FUNC(svunpk,_s32_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x2u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv4i32( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svunpk_u32_x2(svuint16_t zn) { + return SVE_ACLE_FUNC(svunpk,_u32_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x2u11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.sunpk.x2.nxv2i64( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svunpk_s64_x2(svint32_t zn) { + return SVE_ACLE_FUNC(svunpk,_s64_x2)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x2u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uunpk.x2.nxv2i64( [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svunpk_u64_x2(svuint32_t zn) { + return SVE_ACLE_FUNC(svunpk,_u64_x2)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_unpkx4.c @@ -0,0 +1,227 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s16_x410svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint16x4_t test_svunpk_s16_x4(svint8x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s16_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// 
CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u16_x411svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv8i16( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 16) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP8]], [[TMP9]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint16x4_t test_svunpk_u16_x4(svuint8x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u16_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s32_x411svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret 
[[TMP10]] +// +svint32x4_t test_svunpk_s32_x4(svint16x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s32_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u32_x412svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv4i32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 8) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP9]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svuint32x4_t test_svunpk_u32_x4(svuint16x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u32_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], 
[[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_s64_x411svint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.sunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP10]] +// +svint64x4_t test_svunpk_s64_x4(svint32x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_s64_x4)(zn); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svunpk_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CHECK-NEXT: ret [[TMP10]] +// +// CPP-CHECK-LABEL: @_Z18test_svunpk_u64_x412svuint32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.uunpk.x4.nxv2i64( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 2) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[TMP2]], 3 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP8]], [[TMP9]], i64 6) +// CPP-CHECK-NEXT: ret 
[[TMP10]] +// +svuint64x4_t test_svunpk_u64_x4(svuint32x2_t zn) { + return SVE_ACLE_FUNC(svunpk,_u64_x4)(zn); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c @@ -0,0 +1,217 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svuzpq1_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svuzpq1_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svuzpq1_u8(svuint8_t zn, svuint8_t zm) { + return svuzpq1_u8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svuzpq1_u16(svuint16_t zn, svuint16_t zm) { + return svuzpq1_u16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svuzpq1_u32(svuint32_t zn, svuint32_t zm) { + return svuzpq1_u32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svuzpq1_u64(svuint64_t zn, svuint64_t zm) { + return svuzpq1_u64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svuzpq1_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svuzpq1_s8u10__SVInt8_tu10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svuzpq1_s8(svint8_t zn, svint8_t zm) { + return svuzpq1_s8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_s16u11__SVInt16_tu11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svuzpq1_s16(svint16_t zn, svint16_t zm) { + return svuzpq1_s16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_s32u11__SVInt32_tu11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svuzpq1_s32(svint32_t zn, svint32_t zm) { + return svuzpq1_s32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_s64u11__SVInt64_tu11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svuzpq1_s64(svint64_t zn, svint64_t zm) { + return svuzpq1_s64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svuzpq1_f16 +// CHECK-SAME: ( 
[[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8f16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_f16u13__SVFloat16_tu13__SVFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8f16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svuzpq1_f16(svfloat16_t zn, svfloat16_t zm) { + return svuzpq1_f16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4f32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_f32u13__SVFloat32_tu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv4f32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svuzpq1_f32(svfloat32_t zn, svfloat32_t zm) { + return svuzpq1_f32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_f64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2f64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq1_f64u13__SVFloat64_tu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv2f64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svuzpq1_f64(svfloat64_t zn, svfloat64_t zm) { + return svuzpq1_f64(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq1_bf16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8bf16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svuzpq1_bf16u14__SVBFloat16_tu14__SVBFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq1.nxv8bf16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzpq1_bf16(svbfloat16_t zn, svbfloat16_t zm) { + return svuzpq1_bf16(zn, zm); +} + + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c @@ -0,0 +1,216 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svuzpq2_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svuzpq2_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svuzpq2_u8(svuint8_t zn, svuint8_t zm) { + return svuzpq2_u8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svuzpq2_u16(svuint16_t zn, svuint16_t zm) { + return svuzpq2_u16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svuzpq2_u32(svuint32_t zn, svuint32_t zm) { + return svuzpq2_u32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svuzpq2_u64(svuint64_t zn, svuint64_t zm) { + return svuzpq2_u64(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv16i8( 
[[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svuzpq2_s8u10__SVInt8_tu10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svuzpq2_s8(svint8_t zn, svint8_t zm) { + return svuzpq2_s8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_s16u11__SVInt16_tu11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svuzpq2_s16(svint16_t zn, svint16_t zm) { + return svuzpq2_s16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_s32u11__SVInt32_tu11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svuzpq2_s32(svint32_t zn, svint32_t zm) { + return svuzpq2_s32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_s64u11__SVInt64_tu11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svuzpq2_s64(svint64_t zn, svint64_t zm) { + return svuzpq2_s64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svuzpq2_f16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8f16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_f16u13__SVFloat16_tu13__SVFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8f16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svuzpq2_f16(svfloat16_t zn, svfloat16_t zm) { + return svuzpq2_f16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv4f32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_f32u13__SVFloat32_tu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.uzpq2.nxv4f32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svuzpq2_f32(svfloat32_t zn, svfloat32_t zm) { + return svuzpq2_f32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_f64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2f64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svuzpq2_f64u13__SVFloat64_tu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv2f64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svuzpq2_f64(svfloat64_t zn, svfloat64_t zm) { + return svuzpq2_f64(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svuzpq2_bf16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8bf16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svuzpq2_bf16u14__SVBFloat16_tu14__SVBFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzpq2.nxv8bf16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzpq2_bf16(svbfloat16_t zn, svbfloat16_t zm) { + return svuzpq2_bf16(zn, zm); +} + + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx2.c @@ -0,0 +1,579 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svuzp_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svuzp,_s8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svuzp_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svuzp,_u8,,,_x2)(zn, zm); +} + +// 16-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svuzp_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svuzp,_s16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svuzp_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svuzp,_u16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svuzp_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svuzp,_f16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: 
ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svuzp_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svuzp,_bf16,,,_x2)(zn, zm); +} + +// 32-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svuzp_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svuzp,_s32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svuzp_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svuzp,_u32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = 
extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svuzp_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svuzp,_f32,,,_x2)(zn, zm); +} + +// 64-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svuzp_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svuzp,_s64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svuzp_u64_x2(svuint64_t zn, 
svuint64_t zm) { + return SVE_ACLE_FUNC(svuzp,_u64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svuzp_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svuzp,_f64,,,_x2)(zn, zm); +} + +// 128-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svuzpq_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svuzpq_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u8,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svuzpq_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svuzpq_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: 
@_Z18test_svuzpq_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svuzpq_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svuzpq_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svuzpq,_bf16,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svuzpq_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_s32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svuzpq_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svuzpq_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f32,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svuzpq_s64_x2(svint64_t zn, svint64_t zm) { + return 
SVE_ACLE_FUNC(svuzpq,_s64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svuzpq_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svuzpq,_u64,,,_x2)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svuzpq_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svuzpq,_f64,,,_x2)(zn, zm); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpx4.c @@ -0,0 +1,771 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svuzp_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// 
CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svuzp_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 16-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// 
+svint16x4_t test_svuzp_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svuzp_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svuzp_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzp_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svuzp_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzp,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 32-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { 
, , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svuzp_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = 
tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svuzp_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svuzp_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 64-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = 
tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svuzp_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svuzp_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzp_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzp_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svuzp_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svuzp,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 128-bit UZPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svuzpq_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svuzpq_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svuzpq_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svuzpq_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svuzpq_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svuzpq_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svuzpq_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call 
@llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svuzpq_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svuzpq_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] 
+// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svuzpq_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svuzpq_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], 
[[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svuzpq_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = 
extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svuzpq_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svuzpq_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svuzpq_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svuzpq_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svuzpq,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c @@ -0,0 +1,1062 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -DIGNORE_STREAMING_ATTR -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -DIGNORE_STREAMING_ATTR -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef IGNORE_STREAMING_ATTR +#define __attribute__(...) +#endif + + +// WHILEGE + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], 
i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilege_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilege_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilege_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilege.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilege_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilege_c64(op1, op2, 4); +} + + +// WHILEGT + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c8_vl4(int64_t op1, 
int64_t op2) +{ + return svwhilegt_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilegt_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilegt_c64_vl4( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilegt_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilegt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilegt_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilegt_c64(op1, op2, 4); +} + + +// WHILEHI + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 
[[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehi_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehi_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehi.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehi_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehi_c64(op1, op2, 4); +} + + +// WHILEHS + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: 
@_Z21test_svwhilehs_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") 
@llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilehs_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilehs_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilehs.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilehs_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilehs_c64(op1, op2, 4); +} + + +// WHILELE + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_c8_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] 
+// +svcount_t test_svwhilele_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilele_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilele_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilele_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilele.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilele_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilele_c64(op1, op2, 4); +} + + +// WHILELO + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c8(op1, op2, 2); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail 
call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelo_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelo_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelo.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelo_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilelo_c64(op1, op2, 4); +} + + +// WHILELS + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_c8_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c8_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_c8_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c8_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c16_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c16_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret 
target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c16_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c16_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c32_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c32_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c32_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c32_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c64_vl2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c64_vl2(uint64_t op1, uint64_t op2) +{ + return svwhilels_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilels_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilels_c64_vl4mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilels.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilels_c64_vl4(uint64_t op1, uint64_t op2) +{ + return svwhilels_c64(op1, op2, 4); +} + + +// WHILELT + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c8_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_c8_vl2ll( +// CPP-CHECK-NEXT: 
entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c8_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c8(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c8_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_c8_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c8_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c8(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c16_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c16_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c16_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c16(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c16_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c16_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c16(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c16_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c16(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c32_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c32_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c32_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c32(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c32_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c32_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) 
+// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c32_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c32(op1, op2, 4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c64_vl2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c64_vl2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 2) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c64_vl2(int64_t op1, int64_t op2) +{ + return svwhilelt_c64(op1, op2, 2); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svwhilelt_c64_vl4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svwhilelt_c64_vl4ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c64(i64 [[OP1:%.*]], i64 [[OP2:%.*]], i32 4) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP0]] +// +svcount_t test_svwhilelt_c64_vl4(int64_t op1, int64_t op2) +{ + return svwhilelt_c64(op1, op2, 4); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pp.c @@ -0,0 +1,896 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// REQUIRES: aarch64-registered-target +#include + + +// WHILEGE + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilege_b8_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilege_b8_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_b16_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b16_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], 
[[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilege_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilege_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilege_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilege_b64_x2(op1, op2); +} + + +// WHILEGT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilegt_b8_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilegt_b8_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b16_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b16_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilegt_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) 
+// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilegt_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilegt_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilegt_b64_x2(op1, op2); +} + + +// WHILEHI + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilehi_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilehi_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( 
poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehi_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehi_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehi_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehi_b64_x2(op1, op2); +} + + +// WHILEHS + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilehs_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilehs_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehs_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehs_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilehs_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilehs_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilehs_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilehs_b64_x2(op1, op2); +} + + +// WHILELE + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilele_b8_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilele_b8_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b16_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b16_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b32_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b32_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilele_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilele_b64_x2ll( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilele_b64_x2(int64_t op1, int64_t op2) +{ + return svwhilele_b64_x2(op1, op2); +} + + +// WHILELO + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilelo_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilelo_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b8_x2(op1, op2); +} + 
+__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelo_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1:%.*]], i64 
[[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelo_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelo_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelo_b64_x2(op1, op2); +} + + +// WHILELS + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilels_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilels_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// 
CPP-CHECK-LABEL: @_Z21test_svwhilels_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilels_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilels_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilels_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilels_b64_x2(op1, op2); +} + + +// WHILELT + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svwhilelt_b8_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svboolx2_t test_svwhilelt_b8_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b8_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b16_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t 
test_svwhilelt_b16_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b16_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b32_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelt_b32_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b32_x2(op1, op2); +} + +__attribute__((arm_streaming_compatible)) +// CHECK-LABEL: @test_svwhilelt_b64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CHECK-NEXT: ret [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svwhilelt_b64_x2mm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 [[OP1:%.*]], i64 [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[TMP2]], i64 0) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( [[TMP4]]) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP6]] +// +svboolx2_t test_svwhilelt_b64_x2(uint64_t op1, uint64_t op2) +{ + return svwhilelt_b64_x2(op1, op2); +} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c @@ -0,0 +1,217 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svzipq1_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svzipq1_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svzipq1_u8(svuint8_t zn, svuint8_t zm) { + return svzipq1_u8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svzipq1_u16(svuint16_t zn, svuint16_t zm) { + return svzipq1_u16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svzipq1_u32(svuint32_t zn, svuint32_t zm) { + 
return svzipq1_u32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svzipq1_u64(svuint64_t zn, svuint64_t zm) { + return svzipq1_u64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svzipq1_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svzipq1_s8u10__SVInt8_tu10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svzipq1_s8(svint8_t zn, svint8_t zm) { + return svzipq1_s8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_s16u11__SVInt16_tu11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svzipq1_s16(svint16_t zn, svint16_t zm) { + return svzipq1_s16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_s32u11__SVInt32_tu11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svzipq1_s32(svint32_t zn, svint32_t zm) { + return svzipq1_s32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_s64u11__SVInt64_tu11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svzipq1_s64(svint64_t zn, svint64_t zm) { + return svzipq1_s64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svzipq1_f16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8f16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// 
CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_f16u13__SVFloat16_tu13__SVFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8f16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svzipq1_f16(svfloat16_t zn, svfloat16_t zm) { + return svzipq1_f16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4f32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_f32u13__SVFloat32_tu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv4f32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svzipq1_f32(svfloat32_t zn, svfloat32_t zm) { + return svzipq1_f32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_f64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2f64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq1_f64u13__SVFloat64_tu13__SVFloat64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv2f64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat64_t test_svzipq1_f64(svfloat64_t zn, svfloat64_t zm) { + return svzipq1_f64(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq1_bf16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8bf16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z17test_svzipq1_bf16u14__SVBFloat16_tu14__SVBFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq1.nxv8bf16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svzipq1_bf16(svbfloat16_t zn, svbfloat16_t zm) { + return svzipq1_bf16(zn, zm); +} + + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c @@ -0,0 +1,217 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// 
RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\ +// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_svzipq2_u8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svzipq2_u8u11__SVUint8_tu11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint8_t test_svzipq2_u8(svuint8_t zn, svuint8_t zm) { + return svzipq2_u8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_u16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_u16u12__SVUint16_tu12__SVUint16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint16_t test_svzipq2_u16(svuint16_t zn, svuint16_t zm) { + return svzipq2_u16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_u32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_u32u12__SVUint32_tu12__SVUint32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint32_t test_svzipq2_u32(svuint32_t zn, svuint32_t zm) { + return svzipq2_u32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_u64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_u64u12__SVUint64_tu12__SVUint64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svuint64_t test_svzipq2_u64(svuint64_t zn, svuint64_t zm) { + return svzipq2_u64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svzipq2_s8 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv16i8( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z15test_svzipq2_s8u10__SVInt8_tu10__SVInt8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) 
#[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv16i8( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint8_t test_svzipq2_s8(svint8_t zn, svint8_t zm) { + return svzipq2_s8(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_s16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8i16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_s16u11__SVInt16_tu11__SVInt16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8i16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint16_t test_svzipq2_s16(svint16_t zn, svint16_t zm) { + return svzipq2_s16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_s32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4i32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_s32u11__SVInt32_tu11__SVInt32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4i32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint32_t test_svzipq2_s32(svint32_t zn, svint32_t zm) { + return svzipq2_s32(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_s64 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv2i64( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_s64u11__SVInt64_tu11__SVInt64_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv2i64( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svint64_t test_svzipq2_s64(svint64_t zn, svint64_t zm) { + return svzipq2_s64(zn, zm); +} + + +// CHECK-LABEL: define dso_local @test_svzipq2_f16 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8f16( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_f16u13__SVFloat16_tu13__SVFloat16_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv8f16( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_svzipq2_f16(svfloat16_t zn, svfloat16_t zm) { + return svzipq2_f16(zn, zm); +} + +// CHECK-LABEL: define dso_local @test_svzipq2_f32 +// CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4f32( [[ZN]], [[ZM]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z16test_svzipq2_f32u13__SVFloat32_tu13__SVFloat32_t +// CPP-CHECK-SAME: ( [[ZN:%.*]], [[ZM:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zipq2.nxv4f32( [[ZN]], [[ZM]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_svzipq2_f32(svfloat32_t zn, svfloat32_t zm) { + return svzipq2_f32(zn, zm); +} + +// CHECK-LABEL: 
define dso_local <vscale x 2 x double> @test_svzipq2_f64
+// CHECK-SAME: (<vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.zipq2.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z16test_svzipq2_f64u13__SVFloat64_tu13__SVFloat64_t
+// CPP-CHECK-SAME: (<vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.zipq2.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+svfloat64_t test_svzipq2_f64(svfloat64_t zn, svfloat64_t zm) {
+  return svzipq2_f64(zn, zm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svzipq2_bf16
+// CHECK-SAME: (<vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.zipq2.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z17test_svzipq2_bf16u14__SVBFloat16_tu14__SVBFloat16_t
+// CPP-CHECK-SAME: (<vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.zipq2.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svzipq2_bf16(svbfloat16_t zn, svbfloat16_t zm) {
+  return svzipq2_bf16(zn, zm);
+}
+
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx2.c
@@ -0,0 +1,577 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svzip_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svzip,_s8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svzip_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svzip,_u8,_x2,)(zn, zm); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svzip_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svzip,_s16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svzip_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svzip,_u16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svzip_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svzip,_f16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// 
CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svzip_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svzip,_bf16,_x2,)(zn, zm); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svzip_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svzip,_s32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svzip_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svzip,_u32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svzip_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svzip,_f32,_x2,)(zn, zm); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svzip_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svzip,_s64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svzip_u64_x2(svuint64_t zn, svuint64_t zm) { + return 
SVE_ACLE_FUNC(svzip,_u64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svzip_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svzip,_f64,_x2,)(zn, zm); +} + +// 128-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x2u10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svzipq_s8_x2(svint8_t zn, svint8_t zm) { + return SVE_ACLE_FUNC(svzipq,_s8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u8_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x2u11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], 
i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svzipq_u8_x2(svuint8_t zn, svuint8_t zm) { + return SVE_ACLE_FUNC(svzipq,_u8,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x2u11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svzipq_s16_x2(svint16_t zn, svint16_t zm) { + return SVE_ACLE_FUNC(svzipq,_s16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x2u12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svzipq_u16_x2(svuint16_t zn, svuint16_t zm) { + return SVE_ACLE_FUNC(svzipq,_u16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x2u13__SVFloat16_tu13__SVFloat16_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svzipq_f16_x2(svfloat16_t zn, svfloat16_t zm) { + return SVE_ACLE_FUNC(svzipq,_f16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x2u14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svzipq_bf16_x2(svbfloat16_t zn, svbfloat16_t zm) { + return SVE_ACLE_FUNC(svzipq,_bf16,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x2u11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svzipq_s32_x2(svint32_t zn, svint32_t zm) { + return SVE_ACLE_FUNC(svzipq,_s32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u32_x2u12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svzipq_u32_x2(svuint32_t zn, svuint32_t zm) { + return SVE_ACLE_FUNC(svzipq,_u32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x2u13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svzipq_f32_x2(svfloat32_t zn, svfloat32_t zm) { + return SVE_ACLE_FUNC(svzipq,_f32,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x2u11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svzipq_s64_x2(svint64_t zn, svint64_t zm) { + return SVE_ACLE_FUNC(svzipq,_s64,_x2,)(zn, zm); +} + 
+__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x2u12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svzipq_u64_x2(svuint64_t zn, svuint64_t zm) { + return SVE_ACLE_FUNC(svzipq,_u64,_x2,)(zn, zm); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f64_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x2u13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svzipq_f64_x2(svfloat64_t zn, svfloat64_t zm) { + return SVE_ACLE_FUNC(svzipq,_f64,_x2,)(zn, zm); +} + diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipx4.c @@ -0,0 +1,768 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// 8-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svzip_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svzip,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = 
tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svzip_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svzip,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 16-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svzip_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + 
return SVE_ACLE_FUNC(svzip,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svzip_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svzip,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svzip_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svzip,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzip_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svzip_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svzip,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 32-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svzip_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svzip,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], 
[[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svzip_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svzip,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svzip_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svzip,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 64-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( 
[[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svzip_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svzip,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svzip_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svzip,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzip_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } 
[[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzip_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svzip_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svzip,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} + +// 128-bit ZIPs + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_s8_x4u10__SVInt8_tu10__SVInt8_tu10__SVInt8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: 
[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svzipq_s8_x4(svint8_t zn1, svint8_t zn2, svint8_t zn3, svint8_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u8_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z17test_svzipq_u8_x4u11__SVUint8_tu11__SVUint8_tu11__SVUint8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svzipq_u8_x4(svuint8_t zn1, svuint8_t zn2, svuint8_t zn3, svuint8_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u8,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s16_x4u11__SVInt16_tu11__SVInt16_tu11__SVInt16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svzipq_s16_x4(svint16_t zn1, svint16_t zn2, svint16_t zn3, svint16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u16_x4u12__SVUint16_tu12__SVUint16_tu12__SVUint16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svzipq_u16_x4(svuint16_t zn1, svuint16_t zn2, svuint16_t zn3, svuint16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 
0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f16_x4u13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svzipq_f16_x4(svfloat16_t zn1, svfloat16_t zn2, svfloat16_t zn3, svfloat16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_bf16_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z19test_svzipq_bf16_x4u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) 
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svzipq_bf16_x4(svbfloat16_t zn1, svbfloat16_t zn2, svbfloat16_t zn3, svbfloat16_t zn4) { + return SVE_ACLE_FUNC(svzipq,_bf16,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s32_x4u11__SVInt32_tu11__SVInt32_tu11__SVInt32_tu11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svzipq_s32_x4(svint32_t zn1, svint32_t zn2, svint32_t zn3, svint32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: 
@_Z18test_svzipq_u32_x4u12__SVUint32_tu12__SVUint32_tu12__SVUint32_tu12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svzipq_u32_x4(svuint32_t zn1, svuint32_t zn2, svuint32_t zn3, svuint32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f32_x4u13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svzipq_f32_x4(svfloat32_t zn1, svfloat32_t zn2, svfloat32_t zn3, svfloat32_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f32,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_s64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], 
[[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_s64_x4u11__SVInt64_tu11__SVInt64_tu11__SVInt64_tu11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svzipq_s64_x4(svint64_t zn1, svint64_t zn2, svint64_t zn3, svint64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_s64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_u64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_u64_x4u12__SVUint64_tu12__SVUint64_tu12__SVUint64_tu12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } 
[[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svzipq_u64_x4(svuint64_t zn1, svuint64_t zn2, svuint64_t zn3, svuint64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_u64,,,_x4)(zn1, zn2, zn3, zn4); +} + +__attribute__((arm_streaming)) +// CHECK-LABEL: @test_svzipq_f64_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z18test_svzipq_f64_x4u13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( [[ZN1:%.*]], [[ZN2:%.*]], [[ZN3:%.*]], [[ZN4:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svzipq_f64_x4(svfloat64_t zn1, svfloat64_t zn2, svfloat64_t zn3, svfloat64_t zn4) { + return SVE_ACLE_FUNC(svzipq,_f64,,,_x4)(zn1, zn2, zn3, zn4); +} diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c @@ -3,7 +3,6 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=8 -mvscale-max=8 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=16 -mvscale-max=16 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-2048 -// RUN: %clang_cc1 -triple aarch64_32-unknown-darwin -target-feature +sve -target-feature 
+bf16 -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ILP32 // REQUIRES: aarch64-registered-target diff --git a/clang/test/Headers/arm-neon-header.c b/clang/test/Headers/arm-neon-header.c --- a/clang/test/Headers/arm-neon-header.c +++ b/clang/test/Headers/arm-neon-header.c @@ -2,27 +2,27 @@ // RUN: %clang_cc1 -triple thumbv7-apple-darwin10 -target-cpu cortex-a8 -fsyntax-only -flax-vector-conversions=none -ffreestanding %s // RUN: %clang_cc1 -x c++ -triple thumbv7-apple-darwin10 -target-cpu cortex-a8 -fsyntax-only -Wvector-conversions -ffreestanding %s -// RUN: %clang -fsyntax-only -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c89 --sysroot=%S/Inputs -xc %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c99 --sysroot=%S/Inputs -xc %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c11 --sysroot=%S/Inputs -xc %s +// RUN: %clang -fsyntax-only -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c89 -xc %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c99 -xc %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c11 -xc %s -// RUN: %clang -fsyntax-only -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c89 -xc --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c99 -xc --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c11 -xc --sysroot=%S/Inputs %s +// RUN: %clang -fsyntax-only -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c89 -xc %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c99 -xc %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c11 -xc %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++98 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++11 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++14 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++17 -xc++ --sysroot=%S/Inputs %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++98 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++11 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++14 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64-none-elf -march=armv8.2-a+fp16 -std=c++17 -xc++ %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++98 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror 
-ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++11 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++14 -xc++ --sysroot=%S/Inputs %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++17 -xc++ --sysroot=%S/Inputs %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++98 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++11 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++14 -xc++ %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding -nostdinc++ --target=aarch64_be-none-elf -march=armv8.2-a+fp16 -std=c++17 -xc++ %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc --sysroot=%S/Inputs -flax-vector-conversions=none %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc --sysroot=%S/Inputs -flax-vector-conversions=none %s -// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=arm64-linux-gnu -arch +neon -std=c11 -xc --sysroot=%S/Inputs -flax-vector-conversions=none %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64-none-elf -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc -flax-vector-conversions=none %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=aarch64_be-none-elf -march=armv8.2-a+fp16fml+crypto+dotprod -std=c11 -xc -flax-vector-conversions=none %s +// RUN: %clang -fsyntax-only -Wall -Werror -ffreestanding --target=arm64-linux-gnu -arch +neon -std=c11 -xc -flax-vector-conversions=none %s // REQUIRES: aarch64-registered-target || arm-registered-target diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -18,6 +18,8 @@ // CHECK-NEXT: AnyX86NoCfCheck (SubjectMatchRule_hasType_functionType) // CHECK-NEXT: ArcWeakrefUnavailable (SubjectMatchRule_objc_interface) // CHECK-NEXT: ArmBuiltinAlias (SubjectMatchRule_function) +// CHECK-NEXT: ArmLocallyStreaming (SubjectMatchRule_function) +// CHECK-NEXT: ArmNewZA (SubjectMatchRule_function) // CHECK-NEXT: AssumeAligned (SubjectMatchRule_objc_method, SubjectMatchRule_function) // CHECK-NEXT: Assumption (SubjectMatchRule_function, SubjectMatchRule_objc_method) // CHECK-NEXT: Availability ((SubjectMatchRule_record, SubjectMatchRule_enum, SubjectMatchRule_enum_constant, SubjectMatchRule_field, SubjectMatchRule_function, SubjectMatchRule_namespace, SubjectMatchRule_objc_category, SubjectMatchRule_objc_implementation, SubjectMatchRule_objc_interface, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property, SubjectMatchRule_objc_protocol, SubjectMatchRule_record, SubjectMatchRule_type_alias, SubjectMatchRule_variable)) diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c --- a/clang/test/Preprocessor/aarch64-target-features.c +++ 
b/clang/test/Preprocessor/aarch64-target-features.c @@ -229,6 +229,10 @@ // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1 +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME-BF16 %s +// CHECK-SME-BF16: __ARM_FEATURE_SME 1 +// CHECK-SME-BF16: __ARM_FEATURE_UNALIGNED 1 + // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.4a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1 diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -0,0 +1,125 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -target-feature +sme -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include "arm_neon.h" +#include "arm_sme.h" +#include "arm_sve.h" + +__attribute__((arm_streaming)) int16x8_t incompat_neon_sm(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_locally_streaming)) int16x8_t incompat_neon_ls(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_streaming_compatible)) int16x8_t incompat_neon_smc(int16x8_t splat) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__attribute__((arm_shared_za)) void incompat_sme_norm(svbool_t pg, void const *ptr) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + return __builtin_sme_svld1_hor_za128(0, 0, 0, pg, ptr); +} + +__attribute__((arm_streaming_compatible, arm_shared_za)) void incompat_sme_smc(svbool_t pg, void const *ptr) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sme_svld1_hor_za128(0, 0, 0, pg, ptr); +} + +__attribute__((arm_streaming)) +svuint32_t +incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_locally_streaming)) +svuint32_t +incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) +svuint32_t +incompat_sve_smc(svbool_t pg, svuint32_t a, int64_t b) { + // 
expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming)) +svuint32_t +incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_locally_streaming)) +svuint32_t +incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming or locally streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) +svuint32_t +incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__attribute__((arm_shared_za)) +void +incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + svmops_za32_f32_m(0, pn, pm, zn, zm); +} + +__attribute__((arm_streaming)) svfloat64_t +streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_locally_streaming)) svfloat64_t +locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_streaming_compatible)) svfloat64_t +streaming_compatible_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__attribute__((arm_streaming)) svint16_t +streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_locally_streaming)) svint16_t +locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_streaming_compatible)) svint16_t +streaming_compatible_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__attribute__((arm_streaming)) svbool_t +streaming_caller_ptrue(void) { + // expected-no-warning + return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4)); +} diff --git a/clang/test/Sema/aarch64-incompat-za-builtin-calls.c b/clang/test/Sema/aarch64-incompat-za-builtin-calls.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-incompat-za-builtin-calls.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include "arm_sme.h" + +void requires_za_state(void) { + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + return svzero_mask_za(0); +} + +// No diagnostic expected +__attribute__((arm_new_za)) void with_new_za_state(void) { + return svzero_mask_za(0); +} + +// No diagnostic expected +__attribute__((arm_shared_za)) void with_shared_za_state(void) { + return svzero_mask_za(0); +} + +// FIXME: svzero_za() is defined in arm_sme.h, but is not a builtin. 
+// Clang should give a similar diagnostic for 'regular' calls to shared_za +// functions when the caller doesn't have ZA state. +__attribute__((arm_shared_za)) void requires_za_state_svzero_no_mask(void) { + return svzero_za(); +} diff --git a/clang/test/Sema/aarch64-sme-attrs-on-x86.c b/clang/test/Sema/aarch64-sme-attrs-on-x86.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-attrs-on-x86.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +extern int normal_callee(); + +// expected-warning@+1 {{unknown attribute 'arm_streaming' ignored}} +__attribute__((arm_streaming)) +int streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_streaming_compatible' ignored}} +__attribute__((arm_streaming_compatible)) +int streaming_compatible_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_locally_streaming' ignored}} +__attribute__((arm_locally_streaming)) +int locally_streaming_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_shared_za' ignored}} +__attribute__((arm_shared_za)) +int shared_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_preserves_za' ignored}} +__attribute__((arm_preserves_za)) +int preserves_za_caller(void) { + return normal_callee(); +} + +// expected-warning@+1 {{unknown attribute 'arm_new_za' ignored}} +__attribute__((arm_new_za)) +int new_za_caller(void) { + return normal_callee(); +} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -10,8 +10,9 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -#include +#include +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_0(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, -1, 0, pg, ptr); @@ -48,6 +49,7 @@ SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(15, -1, 1, pg, svundef_s8()); } +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_1(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, -1, 0, pg, ptr); @@ -84,6 +86,7 @@ SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(7, -1, 2, pg, svundef_s64()); } +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_3(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, -1, 0, pg, ptr); @@ -138,6 +141,7 @@ SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_7(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, -1, 0, pg, ptr); @@ -197,6 +201,7 @@ SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64()); } +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_15(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} 
SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, -1, 0, pg, ptr); @@ -238,6 +243,7 @@ SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(0, -1, 16, pg, svundef_s8()); } +__attribute__((arm_streaming, arm_shared_za)) void test_range_0_255(svbool_t pg, void *ptr) { // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(256); @@ -245,6 +251,7 @@ SVE_ACLE_FUNC(svzero_mask_za,,,)(-1); } +__attribute__((arm_streaming, arm_shared_za)) void test_constant(uint64_t u64, svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, 0, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} SVE_ACLE_FUNC(svld1_ver_za16,,,)(0, u64, u64, pg, ptr); // expected-error {{argument to 'svld1_ver_za16' must be a constant integer}} diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -3,23 +3,29 @@ // Test that functions with the correct target attributes can use the correct SME intrinsics. -#include - __attribute__((target("sme"))) -void test_sme(svbool_t pg, void *ptr) { - svld1_hor_za8(0, 0, 0, pg, ptr); +void test_sme(__SVBool_t pg, void *ptr) { + // expected-warning@+2 {{builtin call has undefined behaviour when called from a non-streaming function}} + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + __builtin_sme_svld1_hor_za8(0, 0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) -void test_arch_sme(svbool_t pg, void *ptr) { - svld1_hor_vnum_za32(0, 0, 0, pg, ptr, 0); +void test_arch_sme(__SVBool_t pg, void *ptr) { + // expected-warning@+2 {{builtin call has undefined behaviour when called from a non-streaming function}} + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + __builtin_sme_svld1_hor_vnum_za32(0, 0, 0, pg, ptr, 0); } __attribute__((target("+sme"))) -void test_plus_sme(svbool_t pg, void *ptr) { - svst1_ver_za16(0, 0, 0, pg, ptr); +void test_plus_sme(__SVBool_t pg, void *ptr) { + // expected-warning@+2 {{builtin call has undefined behaviour when called from a non-streaming function}} + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + __builtin_sme_svst1_ver_za16(0, 0, 0, pg, ptr); } -void undefined(svbool_t pg, void *ptr) { - svst1_ver_vnum_za64(0, 0, 0, pg, ptr, 0); // expected-error {{'svst1_ver_vnum_za64' needs target feature sme}} +void undefined(__SVBool_t pg, void *ptr) { + // expected-warning@+2 {{builtin call has undefined behaviour when called from a non-streaming function}} + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + __builtin_sme_svst1_ver_vnum_za64(0, 0, 0, pg, ptr, 0); // expected-error {{'__builtin_sme_svst1_ver_vnum_za64' needs target feature sme}} } diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -0,0 +1,559 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve2 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include + 
+__attribute__((arm_streaming, arm_shared_za)) +void test_multivector_read(uint32_t base) { + + // Test Tile Range + svread_hor_za8_u8_vg2(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svread_ver_za8_u8_vg2(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svread_hor_za8_u8_vg4(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svread_ver_za8_u8_vg4(1, base, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + + svread_hor_za16_u16_vg2(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svread_ver_za16_u16_vg2(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svread_hor_za16_u16_vg4(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svread_ver_za16_u16_vg4(2, base, 0); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + svread_hor_za32_u32_vg2(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_u32_vg2(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svread_hor_za32_u32_vg4(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svread_ver_za32_u32_vg4(4, base, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + svread_hor_za64_u64_vg2(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svread_ver_za64_u64_vg2(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svread_hor_za64_u64_vg4(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svread_ver_za64_u64_vg4(8, base, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Offset Range + svread_hor_za8_u8_vg2(0, base, 13); // expected-error {{argument should be a multiple of 2}} + svread_hor_za8_u8_vg2(0, base, 16); // expected-error {{argument value 16 is outside the valid range [0, 14]}} + svread_ver_za8_u8_vg4(0, base, 11); // expected-error {{argument should be a multiple of 4}} + svread_ver_za8_u8_vg4(0, base, 16); // expected-error {{argument value 16 is outside the valid range [0, 12]}} + + svread_hor_za16_u16_vg2(0, base, 5); // expected-error {{argument should be a multiple of 2}} + svread_hor_za16_u16_vg2(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + svread_ver_za16_u16_vg2(0, base, 5); // expected-error {{argument should be a multiple of 2}} + svread_ver_za16_u16_vg2(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + + svread_hor_za16_u16_vg4(0, base, 3); // expected-error {{argument should be a multiple of 4}} + svread_hor_za16_u16_vg4(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + svread_ver_za16_u16_vg4(0, base, 3); // expected-error {{argument should be a multiple of 4}} + svread_ver_za16_u16_vg4(0, base, 8); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + + svread_hor_za32_u32_vg2(0, base, 1); // expected-error {{argument should be a multiple of 2}} + svread_hor_za32_u32_vg2(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + svread_ver_za32_u32_vg2(0, base, 1); // expected-error {{argument should be a multiple of 2}} + svread_ver_za32_u32_vg2(0, base, 4); // expected-error 
{{argument value 4 is outside the valid range [0, 2]}} + + svread_hor_za32_u32_vg4(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + svread_hor_za64_u64_vg2(0, base, 2); // expected-error {{argument value 2 is outside the valid range [0, 0]}} + svread_hor_za64_u64_vg4(0, base, 4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + + svread_za64_u64_vg1x2(base, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svread_za64_u64_vg1x4(base, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_multivector_write(uint32_t base, svuint8x2_t v8x2, svuint8x4_t v8x4, + svuint16x2_t v16x2, svuint16x4_t v16x4, + svuint32x2_t v32x2, svuint32x4_t v32x4, + svuint64x2_t v64x2, svuint64x4_t v64x4) { + + // Test Tile Range + svwrite_hor_za8_u8_vg2(1, base, 0, v8x2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_ver_za8_u8_vg2(1, base, 0, v8x2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_hor_za8_u8_vg4(1, base, 0, v8x4); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svwrite_ver_za8_u8_vg4(1, base, 0, v8x4); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + + svwrite_hor_za16_u16_vg2(2, base, 0, v16x2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_vg2(2, base, 0, v16x2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_hor_za16_u16_vg4(2, base, 0, v16x4); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svwrite_ver_za16_u16_vg4(2, base, 0, v16x4); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + svwrite_hor_za32_u32_vg2(4, base, 0, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_u32_vg2(4, base, 0, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_hor_za32_u32_vg4(4, base, 0, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svwrite_ver_za32_u32_vg4(4, base, 0, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + svwrite_hor_za64_u64_vg2(8, base, 0, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za64_u64_vg2(8, base, 0, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_hor_za64_u64_vg4(8, base, 0, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_ver_za64_u64_vg4(8, base, 0, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Offset Range + svwrite_hor_za8_u8_vg2(0, base, 13, v8x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za8_u8_vg2(0, base, 16, v8x2); // expected-error {{argument value 16 is outside the valid range [0, 14]}} + svwrite_ver_za8_u8_vg4(0, base, 11, v8x4); // expected-error {{argument should be a multiple of 4}} + svwrite_ver_za8_u8_vg4(0, base, 16, v8x4); // expected-error {{argument value 16 is outside the valid range [0, 12]}} + + svwrite_hor_za16_u16_vg2(0, base, 5, v16x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za16_u16_vg2(0, base, 8, v16x2); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + svwrite_ver_za16_u16_vg2(0, base, 5, v16x2); // 
expected-error {{argument should be a multiple of 2}} + svwrite_ver_za16_u16_vg2(0, base, 8, v16x2); // expected-error {{argument value 8 is outside the valid range [0, 6]}} + + svwrite_hor_za16_u16_vg4(0, base, 3, v16x4); // expected-error {{argument should be a multiple of 4}} + svwrite_hor_za16_u16_vg4(0, base, 8, v16x4); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + svwrite_ver_za16_u16_vg4(0, base, 3, v16x4); // expected-error {{argument should be a multiple of 4}} + svwrite_ver_za16_u16_vg4(0, base, 8, v16x4); // expected-error {{argument value 8 is outside the valid range [0, 4]}} + + svwrite_hor_za32_u32_vg2(0, base, 1, v32x2); // expected-error {{argument should be a multiple of 2}} + svwrite_hor_za32_u32_vg2(0, base, 4, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + svwrite_ver_za32_u32_vg2(0, base, 1, v32x2); // expected-error {{argument should be a multiple of 2}} + svwrite_ver_za32_u32_vg2(0, base, 4, v32x2); // expected-error {{argument value 4 is outside the valid range [0, 2]}} + + svwrite_hor_za32_u32_vg4(0, base, 4, v32x4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + svwrite_hor_za64_u64_vg2(0, base, 2, v64x2); // expected-error {{argument value 2 is outside the valid range [0, 0]}} + svwrite_hor_za64_u64_vg4(0, base, 4, v64x4); // expected-error {{argument value 4 is outside the valid range [0, 0]}} + + svwrite_za64_u64_vg1x2(base, 8, v64x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svwrite_za64_u64_vg1x4(base, 8, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint8x2_t s8x2, + svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) { + svmla_single_za32_s8_vg4x1(base, 13, s8, s8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za32_u8_vg4x1(base, 13, u8, u8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za64_s16_vg4x1(base, 13, s16, s16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_single_za64_u16_vg4x1(base, 13, u16, u16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmla_single_za32_s8_vg4x2(base, 5, s8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za32_u8_vg4x2(base, 5, u8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_s16_vg4x2(base, 5, s16x2, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_u16_vg4x2(base, 5, u16x2, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_single_za32_s8_vg4x4(base, 5, s8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za32_u8_vg4x4(base, 5, u8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_s16_vg4x4(base, 5, s16x4, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_single_za64_u16_vg4x4(base, 5, u16x4, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_single_za32_s8_vg4x1(base, 13, s8, s8); // expected-error {{argument 
value 13 is outside the valid range [0, 12]}} + svmls_single_za32_u8_vg4x1(base, 13, u8, u8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_single_za64_s16_vg4x1(base, 13, s16, s16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_single_za64_u16_vg4x1(base, 13, u16, u16); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmls_single_za32_s8_vg4x2(base, 5, s8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za32_u8_vg4x2(base, 5, u8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_s16_vg4x2(base, 5, s16x2, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_u16_vg4x2(base, 5, u16x2, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_single_za32_s8_vg4x4(base, 5, s8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za32_u8_vg4x4(base, 5, u8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_s16_vg4x4(base, 5, s16x4, s16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_single_za64_u16_vg4x4(base, 5, u16x4, u16); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svsumla_single_za32_u8_vg4x2(base, 5, s8x2, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svsumla_single_za32_u8_vg4x4(base, 5, s8x4, u8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_single_za32_s8_vg4x1(base, 13, u8, s8); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svusmla_single_za32_s8_vg4x2(base, 5, u8x2, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_single_za32_s8_vg4x4(base, 5, u8x4, s8); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_za32_s8_vg4x2(base, 5, s8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za32_u8_vg4x2(base, 5, u8x2, u8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_s16_vg4x2(base, 5, s16x2, s16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_u16_vg4x2(base, 5, u16x2, u16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_za32_s8_vg4x4(base, 5, s8x4, s8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za32_u8_vg4x4(base, 5, u8x4, u8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_s16_vg4x4(base, 5, s16x4, s16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_za64_u16_vg4x4(base, 5, u16x4, u16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_za32_s8_vg4x2(base, 5, s8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za32_u8_vg4x2(base, 5, u8x2, u8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_s16_vg4x2(base, 5, s16x2, s16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_u16_vg4x2(base, 5, u16x2, u16x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_za32_s8_vg4x4(base, 5, s8x4, s8x4); // expected-error {{argument value 5 is 
outside the valid range [0, 4]}} + svmls_za32_u8_vg4x4(base, 5, u8x4, u8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_s16_vg4x4(base, 5, s16x4, s16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_za64_u16_vg4x4(base, 5, u16x4, u16x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_za32_s8_vg4x2(base, 5, u8x2, s8x2); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_za32_s8_vg4x4(base, 5, u8x4, s8x4); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x1(base, 13, s8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za32_u8_vg4x1(base, 13, u8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za64_s16_vg4x1(base, 13, s16, s16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmla_lane_za64_u16_vg4x1(base, 13, u16, u16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmla_lane_za32_s8_vg4x2(base, 5, s8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za32_u8_vg4x2(base, 5, u8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_s16_vg4x2(base, 5, s16x2, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_u16_vg4x2(base, 5, u16x2, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x4(base, 5, s8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za32_u8_vg4x4(base, 5, u8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_s16_vg4x4(base, 5, s16x4, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmla_lane_za64_u16_vg4x4(base, 5, u16x4, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmla_lane_za32_s8_vg4x1(base, 12, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x1(base, 12, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x1(base, 12, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x1(base, 12, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x2(base, 4, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x2(base, 4, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x2(base, 4, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x2(base, 4, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x4(base, 4, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x4(base, 4, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x4(base, 4, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + 
svmla_lane_za64_u16_vg4x4(base, 4, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x1(base, 13, s8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za32_u8_vg4x1(base, 13, u8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za64_s16_vg4x1(base, 13, s16, s16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svmls_lane_za64_u16_vg4x1(base, 13, u16, u16, 7); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + + svmls_lane_za32_s8_vg4x2(base, 5, s8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za32_u8_vg4x2(base, 5, u8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_s16_vg4x2(base, 5, s16x2, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_u16_vg4x2(base, 5, u16x2, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_lane_za32_s8_vg4x4(base, 5, s8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za32_u8_vg4x4(base, 5, u8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_s16_vg4x4(base, 5, s16x4, s16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svmls_lane_za64_u16_vg4x4(base, 5, u16x4, u16, 7); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svmls_lane_za32_s8_vg4x1(base, 12, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x1(base, 12, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x1(base, 12, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x1(base, 12, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x2(base, 4, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x2(base, 4, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x2(base, 4, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x2(base, 4, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x4(base, 4, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x4(base, 4, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x4(base, 4, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x4(base, 4, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svsumla_lane_za32_s8_vg4x1(base, 13, s8, u8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svsumla_lane_za32_s8_vg4x2(base, 5, s8x2, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svsumla_lane_za32_s8_vg4x4(base, 5, s8x4, u8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + 
svsumla_lane_za32_s8_vg4x1(base, 12, s8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x2(base, 4, s8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x4(base, 4, s8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + svusmla_lane_za32_u8_vg4x1(base, 13, u8, s8, 15); // expected-error {{argument value 13 is outside the valid range [0, 12]}} + svusmla_lane_za32_u8_vg4x2(base, 5, u8x2, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + svusmla_lane_za32_u8_vg4x4(base, 5, u8x4, s8, 15); // expected-error {{argument value 5 is outside the valid range [0, 4]}} + + svusmla_lane_za32_u8_vg4x1(base, 12, u8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x2(base, 4, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x4(base, 4, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, + svint16x4_t s16x4, svuint16x4_t u16x4, + svfloat16x2_t f16x2, svbfloat16x2_t bf16x2, + svint16_t s16, svuint16_t u16, + svint8_t s8, svuint8_t u8, + svfloat16_t f16, svbfloat16_t b16) { + // Test slice offset values. + svvdot_lane_za32_s16_vg1x2(base, 8, s16x2, s16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_u16_vg1x2(base, 8, u16x2, u16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_s8_vg1x4(base, 8, s8x4, s8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_u8_vg1x4(base, 8, u8x4, u8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za64_s16_vg1x4(base, 8, s16x4, s16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za64_u16_vg1x4(base, 8, u16x4, u16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_f16_vg1x2(base, 8, f16x2, f16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svvdot_lane_za32_bf16_vg1x2(base, 8, bf16x2, b16, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svsuvdot_lane_za32_s8_vg1x4(base, 8, s8x4, s8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svusvdot_lane_za32_u8_vg1x4(base, 8, u8x4, u8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test lane indices. 
+ svvdot_lane_za32_s16_vg1x2(base, 7, s16x2, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u16_vg1x2(base, 7, u16x2, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_s8_vg1x4(base, 7, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u8_vg1x4(base, 7, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za64_s16_vg1x4(base, 7, s16x4, s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za64_u16_vg1x4(base, 7, u16x4, u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za32_f16_vg1x2(base, 7, f16x2, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_bf16_vg1x2(base, 7, bf16x2, b16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svsuvdot_lane_za32_s8_vg1x4(base, 7, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svusvdot_lane_za32_u8_vg1x4(base, 7, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_fdot_za32_bad_slice(uint32_t slice_base, svfloat16_t z_f16, + svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4, + svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2, + svbfloat16x4_t z_bf16x4) { + + // 16-bit float + svdot_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_f16_vg1x2(slice_base, 8, z_f16x2, z_f16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_f16_vg1x4(slice_base, 8, z_f16x4, z_f16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // 16-bit binary float + svdot_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_bf16_vg1x2(slice_base, 8, z_bf16x2, z_bf16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_lane_za32_bf16_vg1x4(slice_base, 8, z_bf16x4, z_bf16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_fdot_za32_bad_lane(uint32_t slice_base, svfloat16_t z_f16, + svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4, + svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2, + svbfloat16x4_t z_bf16x4) { + // 16-bit float + svdot_lane_za32_f16_vg1x2(slice_base, 7, z_f16x2, z_f16, 4); // expected-error {{argument value 4 is 
outside the valid range [0, 3]}} + svdot_lane_za32_f16_vg1x4(slice_base, 7, z_f16x4, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // 16-bit binary float + svdot_lane_za32_bf16_vg1x2(slice_base, 7, z_bf16x2, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_bf16_vg1x4(slice_base, 7, z_bf16x4, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za)) +void test_svdot_multi_za32_bad_slice(uint32_t slice_base, svuint16_t z_u16, + svuint16x2_t z_u16x2, svuint16x4_t z_u16x4, + svint16_t z_s16, svint16x2_t z_s16x2, + svint16x4_t z_s16x4, svuint8_t z_u8, + svuint8x2_t z_u8x2, svuint8x4_t z_u8x4, + svint8_t z_s8, svint8x2_t z_s8x2, + svint8x4_t z_s8x4) { + // Multi, multi (unsigned) + svdot_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, multi (signed) + svdot_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (unsigned) + svdot_single_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Multi, single (signed) + svdot_single_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svdot_single_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16); // expected-error {{argument value 8 is outside the 
valid range [0, 7]}}
+  svdot_single_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_single_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_single_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_single_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, indexed (unsigned)
+  svdot_lane_za32_u16_vg1x2(slice_base, 8, z_u16x2, z_u16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_u16_vg1x4(slice_base, 8, z_u16x4, z_u16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za64_u16_vg1x2(slice_base, 8, z_u16x2, z_u16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za64_u16_vg1x4(slice_base, 8, z_u16x4, z_u16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, indexed (signed)
+  svdot_lane_za32_s16_vg1x2(slice_base, 8, z_s16x2, z_s16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_s16_vg1x4(slice_base, 8, z_s16x4, z_s16, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za64_s16_vg1x2(slice_base, 8, z_s16x2, z_s16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svdot_lane_za64_s16_vg1x4(slice_base, 8, z_s16x4, z_s16, 1); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, multi (unsigned by signed)
+  svusdot_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8x2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svusdot_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, single (unsigned by signed)
+  svusdot_single_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svusdot_single_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, indexed (unsigned by signed)
+  svusdot_lane_za32_u8_vg1x2(slice_base, 8, z_u8x2, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svusdot_lane_za32_u8_vg1x4(slice_base, 8, z_u8x4, z_s8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, single (signed by unsigned)
+  svsudot_single_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svsudot_single_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_u8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+  // Multi, indexed (signed by unsigned)
+  svsudot_lane_za32_s8_vg1x2(slice_base, 8, z_s8x2, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svsudot_lane_za32_s8_vg1x4(slice_base, 8, z_s8x4, z_u8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+
+__attribute__((arm_streaming, arm_shared_za))
+void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
+                                    svuint16x2_t z_u16x2, svuint16x4_t z_u16x4,
+                                    svint16_t z_s16, svint16x2_t z_s16x2,
+                                    svint16x4_t z_s16x4, svuint8_t z_u8,
+                                    svuint8x2_t z_u8x2, svuint8x4_t z_u8x4,
+                                    svint8_t z_s8, svint8x2_t z_s8x2,
+                                    svint8x4_t z_s8x4) {
+  // Multi, indexed (unsigned)
+  svdot_lane_za32_u16_vg1x2(slice_base, 7, z_u16x2, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_u16_vg1x4(slice_base, 7, z_u16x4, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_u8_vg1x2(slice_base, 7, z_u8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_u8_vg1x4(slice_base, 7, z_u8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za64_u16_vg1x2(slice_base, 7, z_u16x2, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svdot_lane_za64_u16_vg1x4(slice_base, 7, z_u16x4, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+
+  // Multi, indexed (signed)
+  svdot_lane_za32_s16_vg1x2(slice_base, 7, z_s16x2, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_s16_vg1x4(slice_base, 7, z_s16x4, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_s8_vg1x2(slice_base, 7, z_s8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za32_s8_vg1x4(slice_base, 7, z_s8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svdot_lane_za64_s16_vg1x2(slice_base, 7, z_s16x2, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svdot_lane_za64_s16_vg1x4(slice_base, 7, z_s16x4, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+
+  // Multi, indexed (unsigned by signed)
+  svusdot_lane_za32_u8_vg1x2(slice_base, 7, z_u8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svusdot_lane_za32_u8_vg1x4(slice_base, 7, z_u8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+
+  // Multi, indexed (signed by unsigned)
+  svsudot_lane_za32_s8_vg1x2(slice_base, 7, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svsudot_lane_za32_s8_vg1x4(slice_base, 7, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+}
+
+__attribute__((arm_streaming_compatible))
+void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) {
+  svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
+void test_ldr_str_zt(const void *const_base, void *base) {
+  svldr_zt(1, const_base); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svstr_zt(1, base); // expected-error {{argument value 1 is outside
the valid range [0, 0]}} +} + +__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) +void test_zero_zt() { + // Test Reg Offset + svzero_zt(1); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti2_lane_zt(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8(0, zn_u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + // Test Reg Offset + svluti2_lane_zt_u16(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16(0, zn_u16, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + // Test Reg Offset + svluti2_lane_zt_u32(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32(0, zn_u32, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u8(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti4_lane_zt_u16(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16(0, zn_u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti4_lane_zt_u32(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32(0, zn_u32, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti2_lane_zt_x2(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti2_lane_zt_u16_x2(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16_x2(0, zn_u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + // Test Reg Offset + svluti2_lane_zt_u32_x2(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32_x2(0, zn_u32, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt_x2(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u8_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside 
the valid range [0, 3]}} + + // Test Reg Offset + svluti4_lane_zt_u16_x2(1, zn_u16, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16_x2(0, zn_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti4_lane_zt_u32_x2(1, zn_u32, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32_x2(0, zn_u32, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti2_lane_zt_x4(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti2_lane_zt_u8_x4(1, zn_u8, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u8_x4(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti2_lane_zt_u16_x4(1, zn_u16, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u16_x4(0, zn_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Test Reg Offset + svluti2_lane_zt_u32_x4(1, zn_u32, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti2_lane_zt_u32_x4(0, zn_u32, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) +void test_svluti4_lane_zt_x4(svuint8_t zn_u8, svuint16_t zn_u16, svuint32_t zn_u32) { + // Test Reg Offset + svluti4_lane_zt_u16_x4(1, zn_u16, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u16_x4(0, zn_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + // Test Reg Offset + svluti4_lane_zt_u32_x4(1, zn_u32, 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + // Test index value range + svluti4_lane_zt_u32_x4(0, zn_u32, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} +} diff --git a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c @@ -0,0 +1,39 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target +#include "arm_sve.h" + +//svldnt1: + +__attribute__((target("+sme2"))) +svuint8x2_t sme2_or_sve2p1_intrinsic_test_sme2_invalid(svcount_t png, const uint8_t *rn) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + return svldnt1_u8_x2(png, rn); +} + +__attribute__((target("+sme2"))) +__attribute__((arm_streaming)) +svint16x4_t sme2_or_sve2p1_intrinsic_test_sme2(svcount_t png, const int16_t *rn) { + // expected-no-warning + return svldnt1_s16_x4(png, rn); +} + +__attribute__((target("+sve2p1"))) +svuint32x2_t sme2_or_sve2p1_intrinsic_test_sve2p1(svcount_t png, const uint32_t *rn) { + // expected-no-warning + return svldnt1_u32_x2(png, rn); +} + +__attribute__((target("+sme2,+sve2p1"))) +__attribute__((arm_streaming)) +svint64x4_t 
sme2_or_sve2p1_intrinsic_test_both_arm_streaming(svcount_t png, const int64_t *rn) { + // expected-no-warning + return svldnt1_s64_x4(png, rn); +} + +__attribute__((target("+sme2,+sve2p1"))) +svint64x4_t sme2_or_sve2p1_intrinsic_test_both_no_arm_streaming(svcount_t png, const int64_t *rn) { + // expected-no-warning + return svldnt1_s64_x4(png, rn); +} diff --git a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp @@ -0,0 +1,264 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +sme2 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> + +__attribute__((arm_streaming)) +void test_svpext_lane_imm_0_3(svcount_t c) { + svpext_lane_c8(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c16(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c32(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpext_lane_c64(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svpext_lane_c8(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c16(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c32(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpext_lane_c64(c, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +__attribute__((arm_streaming)) +void test_svpext_lane_x2_imm_0_1(svcount_t c) { + svpext_lane_c8_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c16_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c32_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svpext_lane_c64_x2(c, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpext_lane_c8_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c16_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c32_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpext_lane_c64_x2(c, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} +} + +__attribute__((arm_streaming)) +void test_svpsel_lane_imm() { + svpsel_lane_b8(svptrue_b8(), svptrue_b8(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svpsel_lane_b16(svptrue_b16(), svptrue_b16(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svpsel_lane_b32(svptrue_b32(), svptrue_b32(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpsel_lane_b64(svptrue_b64(), svptrue_b64(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpsel_lane_b8(svptrue_b8(), svptrue_b8(), 0, 16); // expected-error {{argument value
16 is outside the valid range [0, 15]}} + svpsel_lane_b16(svptrue_b16(), svptrue_b16(), 0, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svpsel_lane_b32(svptrue_b32(), svptrue_b32(), 0, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpsel_lane_b64(svptrue_b64(), svptrue_b64(), 0, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + svpsel_lane_c8(svptrue_c8(), svptrue_b8(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svpsel_lane_c16(svptrue_c16(), svptrue_b16(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svpsel_lane_c32(svptrue_c32(), svptrue_b32(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svpsel_lane_c64(svptrue_c64(), svptrue_b64(), 0, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + + svpsel_lane_c8(svptrue_c8(), svptrue_b8(), 0, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svpsel_lane_c16(svptrue_c16(), svptrue_b16(), 0, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svpsel_lane_c32(svptrue_c32(), svptrue_b32(), 0, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpsel_lane_c64(svptrue_c64(), svptrue_b64(), 0, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} +} + +__attribute__((arm_streaming)) +svcount_t test_svwhile_pn(int64_t op1, int64_t op2) { + svwhilege_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilege_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilegt_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehi_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilehs_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilele_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the 
valid range [2, 4]}} + svwhilele_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelo_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilels_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c8(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c16(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c32(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svwhilelt_c64(op1, op2, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + + svwhilege_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilege_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilegt_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehi_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilehs_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilele_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelo_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c16(op1, op2, 
3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilels_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c8(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c16(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c32(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} + svwhilelt_c64(op1, op2, 3); // expected-error {{argument should be a multiple of 2}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_x2(svint32x2_t zn_x2, svuint32x2_t zn_ux2) { + svqrshr_s16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svqrshr_u16_x2(zn_ux2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svqrshr_s16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svqrshr_u16_x2(zn_ux2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svqrshr_s8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + svqrshr_u8_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svqrshr_s8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svqrshr_u8_x4(zn_ux4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshr_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svqrshr_s16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + svqrshr_u16_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svqrshr_s16_x4(zn_x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} + svqrshr_u16_x4(zn_ux4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming_compatible)) +void test_svqrshrn_x2(svint32x2_t zn_x2, svuint32x2_t zn_ux2) { + svqrshrn_s16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svqrshrn_u16_x2(zn_ux2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svqrshrn_s16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svqrshrn_u16_x2(zn_ux2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrn_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svqrshrn_s8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + svqrshrn_u8_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svqrshrn_s8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svqrshrn_u8_x4(zn_ux4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrn_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svqrshrn_s16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + svqrshrn_u16_x4(zn_ux4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svqrshrn_s16_x4(zn_x4, 65); // expected-error 
{{argument value 65 is outside the valid range [1, 64]}} + svqrshrn_u16_x4(zn_ux4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming)) +void test_svqrshru(svint32x2_t zn_x2, svint32x4_t zn_32x4, svint64x4_t zn_64x4) { + svsqrshru_u16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + svsqrshru_u8_x4(zn_32x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + svsqrshru_u16_x4(zn_64x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svsqrshru_u16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + svsqrshru_u8_x4(zn_32x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + svsqrshru_u16_x4(zn_64x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming_compatible)) +void test_svqrshrun_x2(svint32x2_t zn_x2) { + svsqrshrun_u16_x2(zn_x2, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + + svsqrshrun_u16_x2(zn_x2, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrun_32x4(svint32x4_t zn_x4, svuint32x4_t zn_ux4) { + svsqrshrun_u8_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + + svsqrshrun_u8_x4(zn_x4, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} + +__attribute__((arm_streaming)) +void test_svqrshrun_64x4(svint64x4_t zn_x4, svuint64x4_t zn_ux4) { + svsqrshrun_u16_x4(zn_x4, 0); // expected-error {{argument value 0 is outside the valid range [1, 64]}} + + svsqrshrun_u16_x4(zn_x4, 65); // expected-error {{argument value 65 is outside the valid range [1, 64]}} +} + +__attribute__((arm_streaming)) +void test_cntp(svcount_t c) { + svcntp_c8(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c16(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c32(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + svcntp_c64(c, 6); // expected-error {{argument value 6 is outside the valid range [2, 4]}} + + svcntp_c8(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c16(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c32(c, 3); // expected-error {{argument should be a multiple of 2}} + svcntp_c64(c, 3); // expected-error {{argument should be a multiple of 2}} +} + +__attribute__((arm_streaming_compatible)) +void test_svdot_lane_2way(svint32_t s32, svuint32_t u32, svint16_t s16, svuint16_t u16, + svfloat32_t f32, svfloat16_t f16) { + svdot_lane_s32_s16_s16(s32, s16, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_u32_u16_u16(u32, u16, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_f32_f16_f16(f32, f16, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + + +__attribute__((target("+sve2p1+b16b16"))) +void test_svbfml_lane(svbfloat16_t zda, svbfloat16_t zn, svbfloat16_t zm){ + svmla_lane_bf16(zda, zn, zm, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmla_lane_bf16(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_bf16(zda, zn, zm, -1); // expected-error 
{{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmls_lane_bf16(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((target("+sve2p1"))) +void test_svextq_lane(svint16_t zn_i16, svint16_t zm_i16, svfloat16_t zn_f16, svfloat16_t zm_f16){ + svextq_lane_s16(zn_i16, zm_i16, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + svextq_lane_f16(zn_f16, zm_f16, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} + +__attribute__((target("+sve2p1"))) +void test_svpmov_lane_to_pred(svint8_t zd_i8, svuint16_t zd_i16, svint32_t zd_i32, svuint64_t zd_i64){ + svpmov_lane_s8(zd_i8, 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svpmov_lane_u16(zd_i16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svpmov_lane_s32(zd_i32, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svpmov_lane_u64(zd_i64, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +__attribute__((target("+sve2p1"))) +void test_svpmov_lane_to_vector(svint16_t zd_i16, svuint32_t zd_i32, svint64_t zd_i64, svbool_t pn){ + svpmov_lane_s16_m(zd_i16, pn, 0); // expected-error {{argument value 0 is outside the valid range [1, 1]}} + svpmov_lane_s16_m(zd_i16, pn, 2); // expected-error {{argument value 2 is outside the valid range [1, 1]}} + svpmov_lane_u32_m(zd_i32, pn, 0); // expected-error {{argument value 0 is outside the valid range [1, 3]}} + svpmov_lane_u32_m(zd_i32, pn, 4); // expected-error {{argument value 4 is outside the valid range [1, 3]}} + svpmov_lane_s64_m(zd_i64, pn, 0); // expected-error {{argument value 0 is outside the valid range [1, 7]}} + svpmov_lane_s64_m(zd_i64, pn, 8); // expected-error {{argument value 8 is outside the valid range [1, 7]}} +} diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -552,6 +552,8 @@ void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); void genOverloadTypeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); + void genStreamingSVECompatibleList(raw_ostream &OS, + SmallVectorImpl &Defs); void genIntrinsicRangeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); @@ -2039,6 +2041,30 @@ OS << "#endif\n\n"; } +void NeonEmitter::genStreamingSVECompatibleList( + raw_ostream &OS, SmallVectorImpl &Defs) { + OS << "#ifdef GET_NEON_STREAMING_COMPAT_FLAG\n"; + + std::set Emitted; + for (auto *Def : Defs) { + // If the def has a body (that is, it has Operation DAGs), it won't call + // __builtin_neon_* so we don't need to generate a definition for it. + if (Def->hasBody()) + continue; + + std::string Name = Def->getMangledName(); + if (Emitted.find(Name) != Emitted.end()) + continue; + + // FIXME: We should make exceptions here for some NEON builtins that are + // permitted in streaming mode. + OS << "case NEON::BI__builtin_neon_" << Name + << ": BuiltinType = ArmNonStreaming; break;\n"; + Emitted.insert(Name); + } + OS << "#endif\n\n"; +} + /// Generate the ARM and AArch64 overloaded type checking code for /// SemaChecking.cpp, checking for unique builtin declarations. 
void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS, @@ -2222,6 +2248,8 @@ // Generate ARM overloaded type checking code for SemaChecking.cpp genOverloadTypeCheckCode(OS, Defs); + genStreamingSVECompatibleList(OS, Defs); + // Generate ARM range checking code for shift/lane immediates. genIntrinsicRangeCheckCode(OS, Defs); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -43,6 +43,8 @@ ClassG, // Overloaded name without type suffix }; +enum class ACLEKind { SVE, SME }; + using TypeSpec = std::string; namespace { @@ -73,12 +75,12 @@ public: SVEType() : SVEType(TypeSpec(), 'v') {} - SVEType(TypeSpec TS, char CharMod) + SVEType(TypeSpec TS, char CharMod, unsigned NumVectors = 1) : TS(TS), Float(false), Signed(true), Immediate(false), Void(false), Constant(false), Pointer(false), BFloat(false), DefaultType(false), IsScalable(true), Predicate(false), PredicatePattern(false), PrefetchOp(false), Svcount(false), Bitwidth(128), ElementBitwidth(~0U), - NumVectors(1) { + NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(); applyModifier(CharMod); @@ -91,6 +93,7 @@ bool isScalar() const { return NumVectors == 0; } bool isVector() const { return NumVectors > 0; } bool isScalableVector() const { return isVector() && IsScalable; } + bool isFixedLengthVector() const { return isVector() && !IsScalable; } bool isChar() const { return ElementBitwidth == 8; } bool isVoid() const { return Void & !Pointer; } bool isDefault() const { return DefaultType; } @@ -194,7 +197,9 @@ SVEType getReturnType() const { return Types[0]; } ArrayRef getTypes() const { return Types; } SVEType getParamType(unsigned I) const { return Types[I + 1]; } - unsigned getNumParams() const { return Proto.size() - 1; } + unsigned getNumParams() const { + return Proto.size() - (2 * std::count(Proto.begin(), Proto.end(), '.')) - 1; + } uint64_t getFlags() const { return Flags; } bool isFlagSet(uint64_t Flag) const { return Flags & Flag;} @@ -228,15 +233,23 @@ /// Return the parameter index of the splat operand. unsigned getSplatIdx() const { - // These prototype modifiers are described in arm_sve.td. - auto Idx = Proto.find_first_of("ajfrKLR@"); - assert(Idx != std::string::npos && Idx > 0 && - "Prototype has no splat operand"); - return Idx - 1; + unsigned I = 1, Param = 0; + for (; I < Proto.size(); ++I, ++Param) { + if (Proto[I] == 'a' || Proto[I] == 'j' || Proto[I] == 'f' || + Proto[I] == 'r' || Proto[I] == 'K' || Proto[I] == 'L' || + Proto[I] == 'R' || Proto[I] == '@') + break; + + // Multivector modifier can be skipped + if (Proto[I] == '.') + I += 2; + } + assert(I != Proto.size() && "Prototype has no splat operand"); + return Param; } /// Emits the intrinsic declaration to the ostream. - void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const; + void emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const; private: std::string getMergeSuffix() const { return MergeSuffix; } @@ -344,14 +357,17 @@ /// Emit arm_sve.h. void createHeader(raw_ostream &o); + // Emits core intrinsics in both arm_sme.h and arm_sve.h + void createCoreHeaderIntrinsics(raw_ostream &o, ACLEKind Kind); + /// Emit all the __builtin prototypes and code needed by Sema. void createBuiltins(raw_ostream &o); /// Emit all the information needed to map builtin -> LLVM IR intrinsic. - void createCodeGenMap(raw_ostream &o); + void createCodeGenMap(raw_ostream &o, ACLEKind Kind); /// Emit all the range checks for the immediates. 
- void createRangeChecks(raw_ostream &o); + void createRangeChecks(raw_ostream &o, ACLEKind Kind); /// Create the SVETypeFlags used in CGBuiltins void createTypeFlags(raw_ostream &o); @@ -362,12 +378,21 @@ /// Emit all the SME __builtin prototypes and code needed by Sema. void createSMEBuiltins(raw_ostream &o); + /// Create the SMETypeFlags used in CGBuiltins + void createSmeTypeFlags(raw_ostream &o); + /// Emit all the information needed to map builtin -> LLVM IR intrinsic. void createSMECodeGenMap(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.SM. + void createStreamingAttrs(raw_ostream &o, ACLEKind Kind); + /// Emit all the range checks for the immediates. void createSMERangeChecks(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.ZA. + void createBuiltinZAState(raw_ostream &OS); + /// Create intrinsic and add it to \p Out void createIntrinsic(Record *R, SmallVectorImpl> &Out); @@ -439,7 +464,9 @@ return S; } - assert(isScalableVector() && "Unsupported type"); + if (isFixedLengthVector()) + return "V" + utostr(getNumElements() * NumVectors) + S; + return "q" + utostr(getNumElements() * NumVectors) + S; } @@ -451,11 +478,14 @@ return "enum svprfop"; std::string S; + if (Void) S += "void"; else { if (isScalableVector() || isSvcount()) S += "sv"; + if (isFixedLengthVector()) + S += "__sve_"; if (!Signed && !isFloatingPoint()) S += "u"; @@ -472,7 +502,7 @@ if (!isScalarPredicate() && !isPredicateVector() && !isSvcount()) S += utostr(ElementBitwidth); - if (!isScalableVector() && isVector()) + if (isFixedLengthVector()) S += "x" + utostr(getNumElements()); if (NumVectors > 1) S += "x" + utostr(NumVectors); @@ -540,15 +570,6 @@ void SVEType::applyModifier(char Mod) { switch (Mod) { - case '2': - NumVectors = 2; - break; - case '3': - NumVectors = 3; - break; - case '4': - NumVectors = 4; - break; case 'v': Void = true; break; @@ -591,6 +612,11 @@ Bitwidth = 16; ElementBitwidth = 1; break; + case '{': + IsScalable = false; + Bitwidth = 128; + NumVectors = 1; + break; case 's': case 'a': Bitwidth = ElementBitwidth; @@ -756,6 +782,13 @@ Float = true; ElementBitwidth = 64; break; + case '$': + Predicate = false; + Svcount = false; + Float = false; + BFloat = true; + ElementBitwidth = 16; + break; case 'Q': Constant = true; Pointer = true; @@ -859,11 +892,43 @@ Float = false; BFloat = false; break; + case 'y': + Predicate = true; + Svcount = false; + NumVectors = 0; + Float = false; + BFloat = false; + break; + case '.' : + llvm_unreachable(". should never be a type"); + break; default: llvm_unreachable("Unhandled character!"); } } +/// Returns the modifier and number of vectors for the given operand \p Op. +std::pair getProtoModifier(StringRef Proto, unsigned Op) { + for (unsigned P = 0; !Proto.empty(); ++P) { + unsigned NumVectors = 1; + unsigned CharsToSkip = 1; + char Mod = Proto[0]; + if (Mod == '2' || Mod == '3' || Mod == '4') { + NumVectors = Mod - '0'; + Mod = 'd'; + if (Proto.size() > 1 && Proto[1] == '.') { + Mod = Proto[2]; + CharsToSkip = 3; + } + } + + if (P == Op) + return {Mod, NumVectors}; + + Proto = Proto.drop_front(CharsToSkip); + } + llvm_unreachable("Unexpected Op"); +} //===----------------------------------------------------------------------===// // Intrinsic implementation @@ -879,18 +944,21 @@ MergeSuffix(MergeSuffix.str()), BaseType(BT, 'd'), Flags(Flags), ImmChecks(Checks.begin(), Checks.end()) { // Types[0] is the return value. 
- for (unsigned I = 0; I < Proto.size(); ++I) { - SVEType T(BaseTypeSpec, Proto[I]); + for (unsigned I = 0; I < (getNumParams() + 1); ++I) { + char Mod; + unsigned NumVectors; + std::tie(Mod, NumVectors) = getProtoModifier(Proto, I); + SVEType T(BaseTypeSpec, Mod, NumVectors); Types.push_back(T); // Add range checks for immediates if (I > 0) { if (T.isPredicatePattern()) - ImmChecks.emplace_back( - I - 1, Emitter.getEnumValueForImmCheck("ImmCheck0_31")); + ImmChecks.emplace_back(I - 1, + Emitter.getEnumValueForImmCheck("ImmCheck0_31")); else if (T.isPrefetchOp()) - ImmChecks.emplace_back( - I - 1, Emitter.getEnumValueForImmCheck("ImmCheck0_13")); + ImmChecks.emplace_back(I - 1, + Emitter.getEnumValueForImmCheck("ImmCheck0_13")); } } @@ -987,29 +1055,23 @@ getMergeSuffix(); } -void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const { +void Intrinsic::emitIntrinsic(raw_ostream &OS, ACLEKind Kind) const { bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1; std::string FullName = mangleName(ClassS); std::string ProtoName = mangleName(getClassKind()); - std::string SMEAttrs = ""; - - if (Flags & Emitter.getEnumValueForFlag("IsStreaming")) - SMEAttrs += ", arm_streaming"; - if (Flags & Emitter.getEnumValueForFlag("IsStreamingCompatible")) - SMEAttrs += ", arm_streaming_compatible"; - if (Flags & Emitter.getEnumValueForFlag("IsSharedZA")) - SMEAttrs += ", arm_shared_za"; - if (Flags & Emitter.getEnumValueForFlag("IsPreservesZA")) - SMEAttrs += ", arm_preserves_za"; OS << (IsOverloaded ? "__aio " : "__ai ") - << "__attribute__((__clang_arm_builtin_alias(" - << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_") - << FullName << ")"; - if (!SMEAttrs.empty()) - OS << SMEAttrs; - OS << "))\n"; + << "__attribute__((__clang_arm_builtin_alias("; + + switch (Kind) { + case ACLEKind::SME: + OS << "__builtin_sme_" << FullName << ")))\n"; + break; + case ACLEKind::SVE: + OS << "__builtin_sve_" << FullName << ")))\n"; + break; + } OS << getTypes()[0].str() << " " << ProtoName << "("; for (unsigned I = 0; I < getTypes().size() - 1; ++I) { @@ -1124,10 +1186,12 @@ assert(Arg >= 0 && Kind >= 0 && "Arg and Kind must be nonnegative"); unsigned ElementSizeInBits = 0; + char Mod; + unsigned NumVectors; + std::tie(Mod, NumVectors) = getProtoModifier(Proto, EltSizeArg + 1); if (EltSizeArg >= 0) - ElementSizeInBits = - SVEType(TS, Proto[EltSizeArg + /* offset by return arg */ 1]) - .getElementSizeInBits(); + ElementSizeInBits = SVEType(TS, Mod, NumVectors).getElementSizeInBits(); + ImmChecks.push_back(ImmCheck(Arg, Kind, ElementSizeInBits)); } @@ -1143,6 +1207,31 @@ } } +void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, ACLEKind Kind) { + SmallVector, 128> Defs; + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs); + + // Sort intrinsics in header file by following order/priority: + // - Architectural guard (i.e. 
does it require SVE2 or SVE2_AES) + // - Class (is intrinsic overloaded or not) + // - Intrinsic name + std::stable_sort(Defs.begin(), Defs.end(), + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + auto ToTuple = [](const std::unique_ptr &I) { + return std::make_tuple(I->getGuard(), + (unsigned)I->getClassKind(), + I->getName()); + }; + return ToTuple(A) < ToTuple(B); + }); + + for (auto &I : Defs) + I->emitIntrinsic(OS, Kind); +} + void SVEEmitter::createHeader(raw_ostream &OS) { OS << "/*===---- arm_sve.h - ARM SVE intrinsics " "-----------------------------------===\n" @@ -1187,7 +1276,19 @@ OS << "typedef __SVBFloat16_t svbfloat16_t;\n"; - OS << "#include \n"; + OS << "#include \n\n"; + OS << "typedef __attribute__((vector_size (16))) int8_t __sve_int8x16_t;\n"; + OS << "typedef __attribute__((vector_size (16))) int16_t __sve_int16x8_t;\n"; + OS << "typedef __attribute__((vector_size (16))) int32_t __sve_int32x4_t;\n"; + OS << "typedef __attribute__((vector_size (16))) int64_t __sve_int64x2_t;\n"; + OS << "typedef __attribute__((vector_size (16))) uint8_t __sve_uint8x16_t;\n"; + OS << "typedef __attribute__((vector_size (16))) uint16_t __sve_uint16x8_t;\n"; + OS << "typedef __attribute__((vector_size (16))) uint32_t __sve_uint32x4_t;\n"; + OS << "typedef __attribute__((vector_size (16))) uint64_t __sve_uint64x2_t;\n"; + OS << "typedef __attribute__((vector_size (16))) float16_t __sve_float16x8_t;\n"; + OS << "typedef __attribute__((vector_size (16))) float32_t __sve_float32x4_t;\n"; + OS << "typedef __attribute__((vector_size (16))) float64_t __sve_float64x2_t;\n"; + OS << "typedef __attribute__((vector_size (16))) bfloat16_t __sve_bfloat16x8;\n"; OS << "typedef __SVFloat32_t svfloat32_t;\n"; OS << "typedef __SVFloat64_t svfloat64_t;\n"; @@ -1276,6 +1377,17 @@ "__nodebug__))\n\n"; OS << "#define __aio static __inline__ __attribute__((__always_inline__, " "__nodebug__, __overloadable__))\n\n"; + OS << "#ifdef __ARM_FEATURE_SME\n"; + OS << "#define __asc __attribute__((arm_streaming_compatible))\n"; + OS << "#else\n"; + OS << "#define __asc\n"; + OS << "#endif\n\n"; + + OS << "#if defined(__ARM_FEATURE_SME2)\n"; + OS << "#define __as __attribute__((arm_streaming))\n"; + OS << "#else\n"; + OS << "#define __as\n"; + OS << "#endif\n\n"; // Add reinterpret functions. for (auto ShortForm : { false, true } ) @@ -1284,7 +1396,7 @@ if (ShortForm) { OS << "__aio __attribute__((target(\"sve\"))) " << From.Type << " svreinterpret_" << From.Suffix; - OS << "(" << To.Type << " op) {\n"; + OS << "(" << To.Type << " op) __arm_streaming_compatible {\n"; OS << " return __builtin_sve_reinterpret_" << From.Suffix << "_" << To.Suffix << "(op);\n"; OS << "}\n\n"; @@ -1294,27 +1406,7 @@ << To.Suffix << "(__VA_ARGS__)\n"; } - SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority: - // - Architectural guard (i.e. does it require SVE2 or SVE2_AES) - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort( - Defs.begin(), Defs.end(), [](const std::unique_ptr &A, - const std::unique_ptr &B) { - auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declarations.
- for (auto &I : Defs) - I->emitIntrinsic(OS, *this); + createCoreHeaderIntrinsics(OS, ACLEKind::SVE); OS << "#define svcvtnt_bf16_x svcvtnt_bf16_m\n"; OS << "#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m\n"; @@ -1367,7 +1459,7 @@ OS << "#endif\n\n"; } -void SVEEmitter::createCodeGenMap(raw_ostream &OS) { +void SVEEmitter::createCodeGenMap(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) @@ -1379,7 +1471,15 @@ return A->getMangledName() < B->getMangledName(); }); - OS << "#ifdef GET_SVE_LLVM_INTRINSIC_MAP\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_LLVM_INTRINSIC_MAP\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_LLVM_INTRINSIC_MAP\n"; + break; + } + for (auto &Def : Defs) { // Builtins only exist for non-overloaded intrinsics, overloaded // declarations only live in the header file. @@ -1391,16 +1491,24 @@ std::string LLVMName = Def->getMangledLLVMName(); std::string Builtin = Def->getMangledName(); + switch (Kind) { + case ACLEKind::SME: + OS << "SME"; + break; + case ACLEKind::SVE: + OS << "SVE"; + break; + } if (!LLVMName.empty()) - OS << "SVEMAP1(" << Builtin << ", " << LLVMName << ", " << FlagString + OS << "MAP1(" << Builtin << ", " << LLVMName << ", " << FlagString << "),\n"; else - OS << "SVEMAP2(" << Builtin << ", " << FlagString << "),\n"; + OS << "MAP2(" << Builtin << ", " << FlagString << "),\n"; } OS << "#endif\n\n"; } -void SVEEmitter::createRangeChecks(raw_ostream &OS) { +void SVEEmitter::createRangeChecks(raw_ostream &OS, ACLEKind Kind) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) @@ -1412,8 +1520,14 @@ return A->getMangledName() < B->getMangledName(); }); - - OS << "#ifdef GET_SVE_IMMEDIATE_CHECK\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_IMMEDIATE_CHECK\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_IMMEDIATE_CHECK\n"; + break; + } // Ensure these are only emitted once. 
std::set Emitted; @@ -1423,7 +1537,15 @@ Def->getImmChecks().empty()) continue; - OS << "case SVE::BI__builtin_sve_" << Def->getMangledName() << ":\n"; + switch (Kind) { + case ACLEKind::SME: + OS << "case SME::BI__builtin_sme_"; + break; + case ACLEKind::SVE: + OS << "case SVE::BI__builtin_sve_"; + break; + } + OS << Def->getMangledName() << ":\n"; for (auto &Check : Def->getImmChecks()) OS << "ImmChecks.push_back(std::make_tuple(" << Check.getArg() << ", " << Check.getKind() << ", " << Check.getElementSizeInBits() << "));\n"; @@ -1464,7 +1586,7 @@ } void SVEEmitter::createSMEHeader(raw_ostream &OS) { - OS << "/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics " + OS << "/*===---- arm_sme.h - ARM SME intrinsics " "------===\n" " *\n" " *\n" @@ -1481,45 +1603,49 @@ OS << "#define __ARM_SME_H\n\n"; OS << "#if !defined(__LITTLE_ENDIAN__)\n"; - OS << "#error \"Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h\"\n"; + OS << "#error \"Big endian is currently not supported for arm_sme.h\"\n"; OS << "#endif\n"; - OS << "#include \n\n"; + OS << "#include \n\n"; + OS << "#include \n\n"; + OS << "#ifdef __cplusplus\n"; + OS << "extern \"C\" {\n"; + OS << "#else\n"; + OS << "#include \n"; + OS << "#endif\n\n"; + + OS << "#include \"arm_sve.h\"\n\n"; OS << "/* Function attributes */\n"; OS << "#define __ai static __inline__ __attribute__((__always_inline__, " "__nodebug__))\n\n"; OS << "#define __aio static __inline__ __attribute__((__always_inline__, " "__nodebug__, __overloadable__))\n\n"; - - OS << "#ifdef __cplusplus\n"; - OS << "extern \"C\" {\n"; - OS << "#endif\n\n"; - - SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority similar to SVE: - // - Architectural guard - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort(Defs.begin(), Defs.end(), - [](const std::unique_ptr &A, - const std::unique_ptr &B) { - auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), - (unsigned)I->getClassKind(), - I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declaration. 
- for (auto &I : Defs) { - I->emitIntrinsic(OS, *this); - } + OS << "#define __as __attribute__((arm_streaming))\n\n"; + OS << "#define __asc __attribute__((arm_streaming_compatible))\n\n"; + OS << "#define __asza __attribute__((arm_shared_za))\n\n"; + OS << "#define __apza __attribute__((arm_preserves_za))\n\n"; + + OS << "__asc void __arm_za_disable(void);\n"; + OS << "__ai __asc bool __arm_has_sme(void) {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & (1ULL << 63);\n"; + OS << "}\n"; + + OS << "__ai __asc bool __arm_in_streaming_mode(void) {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & 1;\n"; + OS << "}\n"; + OS << "__ai __asc __asza void svundef_za(void) { }\n\n"; + + createCoreHeaderIntrinsics(OS, ACLEKind::SME); + + OS << "__asc __apza void *__arm_sc_memcpy(void *dest, const void *src, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memmove(void *dest, const void *src, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memset(void *s, int c, size_t n);\n"; + OS << "__asc __apza void *__arm_sc_memchr(void *s, int c, size_t n);\n\n"; OS << "#ifdef __cplusplus\n"; OS << "} // extern \"C\"\n"; @@ -1624,6 +1750,107 @@ OS << "#endif\n\n"; } +/// Create the SMETypeFlags used in CGBuiltins +void SVEEmitter::createSmeTypeFlags(raw_ostream &OS) { + OS << "#ifdef LLVM_GET_SME_IMMCHECKTYPES\n"; + for (auto &KV : ImmCheckTypes) + OS << " " << KV.getKey() << " = " << KV.getValue() << ",\n"; + OS << "#endif\n\n"; +} + +void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + // The mappings must be sorted based on BuiltinID. + llvm::sort(Defs, [](const std::unique_ptr &A, + const std::unique_ptr &B) { + return A->getMangledName() < B->getMangledName(); + }); + + OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n"; + + // Ensure these are only emitted once. + std::set Emitted; + + uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA"); + for (auto &Def : Defs) { + if (Emitted.find(Def->getMangledName()) != Emitted.end()) + continue; + + OS << "case SME::BI__builtin_sme_" << Def->getMangledName() << ":\n"; + if (Def->isFlagSet(IsSharedZAFlag)) + OS << " return true;\n"; + else + OS << " return false;\n"; + + Emitted.insert(Def->getMangledName()); + } + + OS << "#endif\n\n"; +} + +void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + // The mappings must be sorted based on BuiltinID. + llvm::sort(Defs, [](const std::unique_ptr &A, + const std::unique_ptr &B) { + return A->getMangledName() < B->getMangledName(); + }); + + switch (Kind) { + case ACLEKind::SME: + OS << "#ifdef GET_SME_STREAMING_ATTRS\n"; + break; + case ACLEKind::SVE: + OS << "#ifdef GET_SVE_STREAMING_ATTRS\n"; + break; + } + + // Ensure these are only emitted once. 
+ std::set Emitted; + + uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming"); + uint64_t IsStreamingCompatibleFlag = + getEnumValueForFlag("IsStreamingCompatible"); + uint64_t IsStreamingOrSVE2p1 = + getEnumValueForFlag("IsStreamingOrSVE2p1"); + for (auto &Def : Defs) { + if (Emitted.find(Def->getMangledName()) != Emitted.end()) + continue; + + switch (Kind) { + case ACLEKind::SME: + OS << "case SME::BI__builtin_sme_"; + break; + case ACLEKind::SVE: + OS << "case SVE::BI__builtin_sve_"; + break; + } + OS << Def->getMangledName() << ":\n"; + + if (Def->isFlagSet(IsStreamingFlag)) + OS << " BuiltinType = ArmStreaming;\n"; + else if (Def->isFlagSet(IsStreamingCompatibleFlag)) + OS << " BuiltinType = ArmStreamingCompatible;\n"; + else if (Def->isFlagSet(IsStreamingOrSVE2p1)) + OS << " BuiltinType = ArmStreamingOrSVE2p1;\n"; + else + OS << " BuiltinType = ArmNonStreaming;\n"; + OS << " break;\n"; + + Emitted.insert(Def->getMangledName()); + } + + OS << "#endif\n\n"; +} + namespace clang { void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); @@ -1634,30 +1861,49 @@ } void EmitSveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { - SVEEmitter(Records).createCodeGenMap(OS); + SVEEmitter(Records).createCodeGenMap(OS, ACLEKind::SVE); } void EmitSveRangeChecks(RecordKeeper &Records, raw_ostream &OS) { - SVEEmitter(Records).createRangeChecks(OS); + SVEEmitter(Records).createRangeChecks(OS, ACLEKind::SVE); } void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } +void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); +} + void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEHeader(OS); } void EmitSmeBuiltins(RecordKeeper &Records, raw_ostream &OS) { + // FIXME: do we want to reuse createBuiltins? SVEEmitter(Records).createSMEBuiltins(OS); } void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { + // FIXME: do we want to reuse createCodeGenMap? 
SVEEmitter(Records).createSMECodeGenMap(OS); } void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMERangeChecks(OS); } + +void EmitSmeTypeFlags(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createSmeTypeFlags(OS); +} + +void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); +} + +void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createBuiltinZAState(OS); +} + } // End namespace clang diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -83,10 +83,14 @@ GenArmSveBuiltinCG, GenArmSveTypeFlags, GenArmSveRangeChecks, + GenArmSveStreamingAttrs, + GenArmSmeBuiltinZAState, GenArmSmeHeader, GenArmSmeBuiltins, GenArmSmeBuiltinCG, + GenArmSmeTypeFlags, GenArmSmeRangeChecks, + GenArmSmeStreamingAttrs, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -233,14 +237,22 @@ "Generate arm_sve_typeflags.inc for clang"), clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks", "Generate arm_sve_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSveStreamingAttrs, "gen-arm-sve-streaming-attrs", + "Generate arm_sve_streaming_attrs.inc for clang"), clEnumValN(GenArmSmeHeader, "gen-arm-sme-header", "Generate arm_sme.h for clang"), clEnumValN(GenArmSmeBuiltins, "gen-arm-sme-builtins", "Generate arm_sme_builtins.inc for clang"), clEnumValN(GenArmSmeBuiltinCG, "gen-arm-sme-builtin-codegen", "Generate arm_sme_builtin_cg_map.inc for clang"), + clEnumValN(GenArmSmeTypeFlags, "gen-arm-sme-typeflags", + "Generate arm_sme_typeflags.inc for clang"), clEnumValN(GenArmSmeRangeChecks, "gen-arm-sme-sema-rangechecks", "Generate arm_sme_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs", + "Generate arm_sme_streaming_attrs.inc for clang"), + clEnumValN(GenArmSmeBuiltinZAState, "gen-arm-sme-builtin-za-state", + "Generate arm_sme_builtins_za_state.inc for clang"), clEnumValN(GenArmMveHeader, "gen-arm-mve-header", "Generate arm_mve.h for clang"), clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def", @@ -472,6 +484,9 @@ case GenArmSveRangeChecks: EmitSveRangeChecks(Records, OS); break; + case GenArmSveStreamingAttrs: + EmitSveStreamingAttrs(Records, OS); + break; case GenArmSmeHeader: EmitSmeHeader(Records, OS); break; @@ -481,9 +496,18 @@ case GenArmSmeBuiltinCG: EmitSmeBuiltinCG(Records, OS); break; + case GenArmSmeTypeFlags: + EmitSmeTypeFlags(Records, OS); + break; case GenArmSmeRangeChecks: EmitSmeRangeChecks(Records, OS); break; + case GenArmSmeStreamingAttrs: + EmitSmeStreamingAttrs(Records, OS); + break; + case GenArmSmeBuiltinZAState: + EmitSmeBuiltinZAState(Records, OS); + break; case GenArmCdeHeader: EmitCdeHeader(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -102,6 +102,15 @@ void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); + +void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltins(llvm::RecordKeeper 
&Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1009,11 +1009,13 @@ } /// Create a call to the vector.extract intrinsic. - CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, - const Twine &Name = "") { + Value *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, + const Twine &Name = "") { + if (cast(Idx)->isZero() && DstType == SrcVec->getType()) + return SrcVec; return CreateIntrinsic(Intrinsic::vector_extract, - {DstType, SrcVec->getType()}, {SrcVec, Idx}, nullptr, - Name); + {DstType, SrcVec->getType()}, {SrcVec, Idx}, + nullptr, Name); } /// Create a call to the vector.insert intrinsic. diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1401,6 +1401,12 @@ llvm_anyvector_ty], [IntrNoMem]>; +class AdvSIMD_SVE_V128_Reduce_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_SVE_SADDV_Reduce_Intrinsic : DefaultAttrsIntrinsic<[llvm_i64_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -1685,6 +1691,13 @@ def int_aarch64_sve_sqsub_x : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_sve_uqadd_x : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_sminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_uminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; // Shifts @@ -1996,6 +2009,11 @@ def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_Reduce_Intrinsic; def int_aarch64_sve_fminv : AdvSIMD_SVE_Reduce_Intrinsic; def int_aarch64_sve_fminnmv : AdvSIMD_SVE_Reduce_Intrinsic; +def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_fmaxnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_fminnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_fmaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; +def int_aarch64_sve_fminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; // // Floating-point conversions @@ -2132,6 +2150,8 @@ def int_aarch64_sve_ld1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; +// 128-bit loads, unscaled offsets +def int_aarch64_sve_ld1q_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; // // First-faulting gather loads: scalar base + vector offsets @@ -2208,6 +2228,9 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; +// 128-bit stores, unscaled offsets +def 
int_aarch64_sve_st1q_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; + // // Non-temporal scatter stores: scalar base + vector offsets // @@ -2614,6 +2637,26 @@ def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic; def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic; + +def int_aarch64_sve_ld2q_sret : AdvSIMD_2Vec_PredLoad_Intrinsic; +def int_aarch64_sve_ld3q_sret : AdvSIMD_3Vec_PredLoad_Intrinsic; +def int_aarch64_sve_ld4q_sret : AdvSIMD_4Vec_PredLoad_Intrinsic; + +def int_aarch64_sve_st2q : AdvSIMD_2Vec_PredStore_Intrinsic; +def int_aarch64_sve_st3q : AdvSIMD_3Vec_PredStore_Intrinsic; +def int_aarch64_sve_st4q : AdvSIMD_4Vec_PredStore_Intrinsic; + +// +// SVE2.1 - Contiguous loads to quadword (single vector) +// + +class SVE2p1_Single_Load_Quadword + :DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_nxv1i1_ty, llvm_ptr_ty], + [IntrReadMem]>; +def int_aarch64_sve_ld1uwq : SVE2p1_Single_Load_Quadword; +def int_aarch64_sve_ld1udq : SVE2p1_Single_Load_Quadword; + // // SVE2.1 - Contiguous stores to multiple consecutive vectors // @@ -2633,7 +2676,56 @@ def int_aarch64_sve_st1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; def int_aarch64_sve_stnt1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic; def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; -} + +// +// SVE2.1 - Contiguous store from quadword (single vector) +// + +class SVE2p1_Single_Store_Quadword + : DefaultAttrsIntrinsic<[], + [llvm_anyvector_ty, + llvm_nxv1i1_ty, llvm_ptr_ty], + [IntrArgMemOnly]>; +def int_aarch64_sve_st1uwq : SVE2p1_Single_Store_Quadword; +def int_aarch64_sve_st1udq : SVE2p1_Single_Store_Quadword; + +// +// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 +// +def int_aarch64_sve_zipq1 : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_zipq2 : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_uzpq1 : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_uzpq2 : AdvSIMD_2VectorArg_Intrinsic; + +// +// SVE2.1 - Programmable table lookup within each quadword vector segment (zeroing)/(merging) +// +def int_aarch64_sve_tblq : AdvSIMD_SVE_TBL_Intrinsic; +def int_aarch64_sve_tbxq : AdvSIMD_SVE2_TBX_Intrinsic; + +// +// SVE2.1 - Extract vector segment from each pair of quadword segments. 
+// +def int_aarch64_sve_extq_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; + +// +// SVE2.1 - Move predicate to/from vector +// +def int_aarch64_sve_pmov_to_pred_lane : + DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + +def int_aarch64_sve_pmov_to_vector_lane_merging : + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + +def int_aarch64_sve_pmov_to_vector_lane_zeroing : + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrNoMem]>; // // SVE2 - Contiguous conflict detection @@ -2647,6 +2739,7 @@ def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic; def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic; def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic; +} // Scalable Matrix Extension (SME) Intrinsics let TargetPrefix = "aarch64" in { @@ -2679,10 +2772,11 @@ def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; // Spill + fill - def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; - def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + class SME_LDR_STR_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>; + + def int_aarch64_sme_ldr : SME_LDR_STR_Intrinsic; + def int_aarch64_sme_str : SME_LDR_STR_Intrinsic; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -2750,6 +2844,8 @@ def int_aarch64_sme_cntsw : AdvSIMD_SME_CNTSB_Intrinsic; def int_aarch64_sme_cntsd : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_get_live_za_slices : AdvSIMD_SME_CNTSB_Intrinsic; + // // PSTATE Functions // @@ -2761,6 +2857,10 @@ : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrNoMem, IntrHasSideEffects]>; + def int_aarch64_sme_invoke_resume_pstatesm + : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrHasSideEffects]>; + + // def int_aarch64_sme_za_enable : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; def int_aarch64_sme_za_disable @@ -3011,6 +3111,16 @@ [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class SME2_BFMLS_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty], + [IntrNoMem]>; + + class SME2_BFMLS_Lane_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + class SME2_ZA_ArrayVector_Read_VG2_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty], @@ -3214,6 +3324,11 @@ def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; + def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic; + def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic; + + def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic; + def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic; // Multi-vector signed saturating doubling multiply high def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic; @@ -3233,7 +3348,7 @@ // Multi-vector min/max // - foreach ty = ["f", "s", "u"] in { + foreach ty = ["bf", "f", "s", "u"] in { foreach instr = ["max", "min"] in { def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic; def int_aarch64_sve_ # ty # instr # 
_single_x4 : SME2_VG4_Multi_Single_Intrinsic; @@ -3259,9 +3374,7 @@ // Multi-vector vertical dot-products // - def int_aarch64_sme_fvdot_lane_za32_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; - - foreach ty = ["s", "u"] in { + foreach ty = ["f", "s", "u"] in { def int_aarch64_sme_ #ty # vdot_lane_za32_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic; def int_aarch64_sme_ #ty # vdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; def int_aarch64_sme_ #ty # vdot_lane_za64_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic; @@ -3438,4 +3551,48 @@ def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic; def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic; + // + // Spill and fill of ZT0 + // + + def int_aarch64_sme_ldr_zt : SME_LDR_STR_Intrinsic; + def int_aarch64_sme_str_zt : SME_LDR_STR_Intrinsic; + + // + // Zero ZT0 + // + + def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg>, IntrWriteMem]>; + + // + // Lookup table expand one register + // + def int_aarch64_sme_luti2_lane_zt + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + + // + // Lookup table expand two registers + // + def int_aarch64_sme_luti2_lane_zt_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + + // + // Lookup table expand four registers + // + def int_aarch64_sme_luti2_lane_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; + def int_aarch64_sme_luti4_lane_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty], + [ImmArg>, ImmArg>, IntrReadMem]>; } diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -53,6 +53,7 @@ FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64BranchTargetsPass(); FunctionPass *createAArch64MIPeepholeOptPass(); +FunctionPass *createSMEPeepholeOptPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -88,6 +89,8 @@ void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64MIPeepholeOptPass(PassRegistry &); +void initializeSMEPeepholeOptPass(PassRegistry &); +void initializeAArch64SIMDInstrOptPass(PassRegistry &); void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); void initializeAArch64PostLegalizerLoweringPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -109,6 +109,9 @@ def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>; +def 
FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension (FEAT_BF16)" >; + def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension (FEAT_SPE)">; @@ -169,7 +172,7 @@ "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>; def FeatureSVE2p1: SubtargetFeature<"sve2p1", "HasSVE2p1", "true", - "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>; + "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2, FeatureBF16]>; def FeatureB16B16 : SubtargetFeature<"b16b16", "HasB16B16", "true", "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", []>; @@ -438,9 +441,6 @@ "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; -def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", - "true", "Enable BFloat16 Extension (FEAT_BF16)" >; - def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", "true", "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">; diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1063,10 +1063,14 @@ } void AArch64AsmPrinter::emitFunctionEntryLabel() { + SMEAttrs Attrs(MF->getFunction()); if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall || - MF->getInfo()->isSVECC()) { + MF->getInfo()->isSVECC() || + Attrs.hasStreamingInterface() || + Attrs.hasStreamingCompatibleInterface() || + Attrs.hasSharedZAInterface()) { auto *TS = static_cast(OutStreamer->getTargetStreamer()); TS->emitDirectiveVariantPCS(CurrentFnSym); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1515,17 +1515,12 @@ NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; } - case AArch64::OBSCURE_COPY: { - if (MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) - .add(MI.getOperand(0)) - .addReg(AArch64::XZR) - .add(MI.getOperand(1)) - .addImm(0); - } + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: MI.eraseFromParent(); return true; - } case AArch64::LD1B_2Z_IMM_PSEUDO: return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, @@ -1654,6 +1649,38 @@ return expandMultiVecPseudo( MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED); + case AArch64::LUTI2_2ZTZI_B_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_S_2ZTZI_B); + case AArch64::LUTI2_2ZTZI_H_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LUTI2_2ZTZI_H, AArch64::LUTI2_S_2ZTZI_H); + case AArch64::LUTI2_4ZTZI_B_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_S_4ZTZI_B); + case AArch64::LUTI2_4ZTZI_H_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LUTI2_4ZTZI_H, AArch64::LUTI2_S_4ZTZI_H); + case AArch64::LUTI4_2ZTZI_B_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_S_2ZTZI_B); + case AArch64::LUTI4_2ZTZI_H_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LUTI4_2ZTZI_H, AArch64::LUTI4_S_2ZTZI_H); + case AArch64::LUTI4_4ZTZI_H_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_S_4ZTZI_H); + case AArch64::LUTI4_4ZTZI_S_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LUTI4_4ZTZI_S, AArch64::LUTI4_S_4ZTZI_S); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -5186,9 +5186,12 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) { + const AArch64Subtarget *Subtarget = + &FuncInfo.MF->getSubtarget(); SMEAttrs CallerAttrs(*FuncInfo.Fn); - if (CallerAttrs.hasZAState() || - (!CallerAttrs.hasStreamingInterface() && CallerAttrs.hasStreamingBody())) + if (CallerAttrs.hasZAState() || CallerAttrs.hasZTState() || + CallerAttrs.hasStreamingInterfaceOrBody() || + CallerAttrs.hasStreamingCompatibleInterface()) return nullptr; return new AArch64FastISel(FuncInfo, LibInfo); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -427,6 +427,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + // Win64 EH requires a frame pointer if funclets are present, as 
the locals // are accessed off the frame pointer in both the parent function and the // funclets. @@ -3254,7 +3255,12 @@ bool AArch64FrameLowering::enableStackSlotScavenging( const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); - return AFI->hasCalleeSaveStackFreeSpace(); + return AFI->hasCalleeSaveStackFreeSpace() && + // If the function has streaming-mode changes, don't scavenge a + // spillslot in the callee-save area, as that might require an + // 'addvl' in the streaming-mode-changing call-sequence when the + // function doesn't use a FP. + !AFI->hasStreamingModeChanges(); } /// returns true if there are any SVE callee saves. diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -326,9 +326,13 @@ return false; } - template bool ImmToTile(SDValue N, SDValue &Imm) { + template bool ImmToTile(SDValue N, SDValue &Imm) { if (auto *CI = dyn_cast(N)) { uint64_t C = CI->getZExtValue(); + + if (C > Max) + return false; + Imm = CurDAG->getRegister(BaseReg + C, MVT::Other); return true; } @@ -387,6 +391,8 @@ template void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg, unsigned Op); + template + void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. @@ -1601,10 +1607,11 @@ } enum class SelectTypeKind { - Int1 = 0, + AnyType = 0, Int = 1, FP = 2, - AnyType = 3, + Int1 = 3, + BF16 = 4 }; /// This function selects an opcode from a list of opcodes, which is @@ -1633,6 +1640,10 @@ if (EltVT != MVT::f16 && EltVT != MVT::f32 && EltVT != MVT::f64) return 0; break; + case SelectTypeKind::BF16: + if (EltVT != MVT::bf16) + return 0; + break; } unsigned Offset; @@ -1751,7 +1762,7 @@ void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, unsigned Opc_ri, unsigned Opc_rr, bool IsIntr) { - assert(Scale < 4 && "Invalid scaling value."); + assert(Scale < 5 && "Invalid scaling value."); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1823,6 +1834,34 @@ SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc) { + if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) + if (Imm->getZExtValue() > Max) + return; + + SDValue ZtValue; + ImmToTile(Node->getOperand(2), ZtValue); + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned i = 0; i < NumOutVecs; ++i) + ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumOutVecs; + ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, unsigned Op) { SDLoc DL(N); @@ -4637,6 +4676,18 @@ case Intrinsic::aarch64_ld64b: SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0); return; + case Intrinsic::aarch64_sve_ld2q_sret: { + SelectPredicatedLoad(Node, 2, 4, AArch64::LD2Q_IMM, AArch64::LD2Q, true); + return; + } + case 
Intrinsic::aarch64_sve_ld3q_sret: { + SelectPredicatedLoad(Node, 3, 4, AArch64::LD3Q_IMM, AArch64::LD3Q, true); + return; + } + case Intrinsic::aarch64_sve_ld4q_sret: { + SelectPredicatedLoad(Node, 4, 4, AArch64::LD4Q_IMM, AArch64::LD4Q, true); + return; + } case Intrinsic::aarch64_sve_ld2_sret: { if (VT == MVT::nxv16i8) { SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B, @@ -5000,6 +5051,88 @@ MF.getInfo()->setHasSwiftAsyncContext(true); return; } + case Intrinsic::aarch64_sme_luti2_lane_zt_x2: { + if (Subtarget->hasSME2p1()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_2ZTZI_B_PSEUDO, AArch64::LUTI2_2ZTZI_H_PSEUDO, + AArch64::LUTI2_2ZTZI_S})) + // Second Immediate must be <= 7: + SelectMultiVectorLuti<7>(Node, 2, Opc); + return; + } else if (Subtarget->hasSME2()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H, + AArch64::LUTI2_2ZTZI_S})) + // Second Immediate must be <= 7: + SelectMultiVectorLuti<7>(Node, 2, Opc); + return; + } else { + break; + } + } + case Intrinsic::aarch64_sme_luti4_lane_zt_x2: { + if (Subtarget->hasSME2p1()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI4_2ZTZI_B_PSEUDO, AArch64::LUTI4_2ZTZI_H_PSEUDO, + AArch64::LUTI4_2ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 2, Opc); + return; + } else if (Subtarget->hasSME2()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H, + AArch64::LUTI4_2ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 2, Opc); + return; + } else { + break; + } + } + case Intrinsic::aarch64_sme_luti2_lane_zt_x4: { + if (Subtarget->hasSME2p1()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_4ZTZI_B_PSEUDO, AArch64::LUTI2_4ZTZI_H_PSEUDO, + AArch64::LUTI2_4ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 4, Opc); + return; + } else if (Subtarget->hasSME2()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H, + AArch64::LUTI2_4ZTZI_S})) + // Second Immediate must be <= 3: + SelectMultiVectorLuti<3>(Node, 4, Opc); + return; + } else { + break; + } + } + case Intrinsic::aarch64_sme_luti4_lane_zt_x4: { + if (Subtarget->hasSME2p1()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::LUTI4_4ZTZI_H_PSEUDO, + AArch64::LUTI4_4ZTZI_S_PSEUDO})) + // Second Immediate must be <= 1: + SelectMultiVectorLuti<1>(Node, 4, Opc); + return; + } else if (Subtarget->hasSME2()) { + if (auto Opc = SelectOpcodeFromVT( + Node->getValueType(0), + {0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S})) + // Second Immediate must be <= 1: + SelectMultiVectorLuti<1>(Node, 4, Opc); + return; + } else { + break; + } + } } } break; case ISD::INTRINSIC_WO_CHAIN: { @@ -5194,6 +5327,11 @@ AArch64::UMAX_VG2_2ZZ_S, AArch64::UMAX_VG2_2ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 2, false, Op); return; + case Intrinsic::aarch64_sve_bfmax_single_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG2_2ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, false, Op); + return; case Intrinsic::aarch64_sve_fmax_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5215,6 +5353,11 @@ AArch64::UMAX_VG4_4ZZ_S, AArch64::UMAX_VG4_4ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 4, false, Op); return; + case 
Intrinsic::aarch64_sve_bfmax_single_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG4_4ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, false, Op); + return; case Intrinsic::aarch64_sve_fmax_single_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5236,6 +5379,11 @@ AArch64::UMIN_VG2_2ZZ_S, AArch64::UMIN_VG2_2ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 2, false, Op); return; + case Intrinsic::aarch64_sve_bfmin_single_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG2_2ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, false, Op); + return; case Intrinsic::aarch64_sve_fmin_single_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5257,6 +5405,11 @@ AArch64::UMIN_VG4_4ZZ_S, AArch64::UMIN_VG4_4ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 4, false, Op); return; + case Intrinsic::aarch64_sve_bfmin_single_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG4_4ZZ_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, false, Op); + return; case Intrinsic::aarch64_sve_fmin_single_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5278,6 +5431,11 @@ AArch64::UMAX_VG2_2Z2Z_S, AArch64::UMAX_VG2_2Z2Z_D})) SelectDestructiveMultiIntrinsic(Node, 2, true, Op); return; + case Intrinsic::aarch64_sve_bfmax_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG2_2Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; case Intrinsic::aarch64_sve_fmax_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5299,6 +5457,11 @@ AArch64::UMAX_VG4_4Z4Z_S, AArch64::UMAX_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sve_bfmax_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMAX_VG4_4Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_fmax_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5320,6 +5483,11 @@ AArch64::UMIN_VG2_2Z2Z_S, AArch64::UMIN_VG2_2Z2Z_D})) SelectDestructiveMultiIntrinsic(Node, 2, true, Op); return; + case Intrinsic::aarch64_sve_bfmin_x2: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG2_2Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 2, true, Op); + return; case Intrinsic::aarch64_sve_fmin_x2: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5341,6 +5509,11 @@ AArch64::UMIN_VG4_4Z4Z_S, AArch64::UMIN_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; + case Intrinsic::aarch64_sve_bfmin_x4: + if (auto Op = SelectOpcodeFromVT( + Node->getValueType(0), {0, AArch64::BFMIN_VG4_4Z2Z_H, 0, 0})) + SelectDestructiveMultiIntrinsic(Node, 4, true, Op); + return; case Intrinsic::aarch64_sve_fmin_x4: if (auto Op = SelectOpcodeFromVT( Node->getValueType(0), @@ -5849,6 +6022,18 @@ } break; } + case Intrinsic::aarch64_sve_st2q: { + SelectPredicatedStore(Node, 2, 4, AArch64::ST2Q, AArch64::ST2Q_IMM); + return; + } + case Intrinsic::aarch64_sve_st3q: { + SelectPredicatedStore(Node, 3, 4, AArch64::ST3Q, AArch64::ST3Q_IMM); + return; + } + case Intrinsic::aarch64_sve_st4q: { + SelectPredicatedStore(Node, 4, 4, AArch64::ST4Q, AArch64::ST4Q_IMM); + return; + } case Intrinsic::aarch64_sve_st2: { if (VT == MVT::nxv16i8) { SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); @@ -6630,14 +6815,32 @@ return getPackedVectorTypeFromPredicateType( Ctx, 
Root->getOperand(2)->getValueType(0), /*NumVec=*/1); case Intrinsic::aarch64_sve_ld2_sret: - return getPackedVectorTypeFromPredicateType( + case Intrinsic::aarch64_sve_ld2q_sret: + return getPackedVectorTypeFromPredicateType( Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/2); + case Intrinsic::aarch64_sve_st2q: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(4)->getValueType(0), /*NumVec=*/2); case Intrinsic::aarch64_sve_ld3_sret: + case Intrinsic::aarch64_sve_ld3q_sret: return getPackedVectorTypeFromPredicateType( Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/3); + case Intrinsic::aarch64_sve_st3q: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(5)->getValueType(0), /*NumVec=*/3); case Intrinsic::aarch64_sve_ld4_sret: + case Intrinsic::aarch64_sve_ld4q_sret: return getPackedVectorTypeFromPredicateType( Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/4); + case Intrinsic::aarch64_sve_st4q: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(6)->getValueType(0), /*NumVec=*/4); + case Intrinsic::aarch64_sve_ld1udq: + case Intrinsic::aarch64_sve_st1udq: + return EVT(MVT::nxv1i64); + case Intrinsic::aarch64_sve_ld1uwq: + case Intrinsic::aarch64_sve_st1uwq: + return EVT(MVT::nxv1i32); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,16 +58,13 @@ CALL_BTI, // Function call followed by a BTI instruction. - // Essentially like a normal COPY that works on GPRs, but cannot be - // rematerialised by passes like the simple register coalescer. It's - // required for SME when lowering calls because we cannot allow frame - // index calculations using addvl to slip in between the smstart/smstop - // and the bl instruction. The scalable vector length may change across - // the smstart/smstop boundary. - OBSCURE_COPY, + COALESCER_BARRIER, + SMSTART, SMSTOP, RESTORE_ZA, + RESTORE_ZT, + SAVE_ZT, // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. 
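As a point of reference, a minimal source-level sketch (assuming the ACLE `__arm_streaming` keyword attribute; the function names are hypothetical) of the situation the COALESCER_BARRIER node above is for: a non-streaming caller invoking a streaming callee gets an smstart/smstop wrapped around the call, and copies of the FP argument and result must not be coalesced into whole Z registers across that mode change.

  // Hedged illustration only, not taken from this patch.
  float callee(float x) __arm_streaming;

  float caller(float x) {
    // The call is bracketed by smstart/smstop. 'x' and the return value live
    // in FP registers; COALESCER_BARRIER stops the register coalescer from
    // merging their copies with full Z registers, which would otherwise force
    // a whole Z-register spill/fill around the streaming-mode change.
    return callee(x) + 1.0f;
  }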
@@ -374,6 +371,7 @@ GLD1_UXTW_SCALED_MERGE_ZERO, GLD1_SXTW_SCALED_MERGE_ZERO, GLD1_IMM_MERGE_ZERO, + GLD1Q_MERGE_ZERO, // Signed gather loads GLD1S_MERGE_ZERO, @@ -418,6 +416,7 @@ SST1_UXTW_SCALED_PRED, SST1_SXTW_SCALED_PRED, SST1_IMM_PRED, + SST1Q_PRED, // Non-temporal scatter store SSTNT1_PRED, @@ -603,10 +602,21 @@ MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitZASpillFill(MachineInstr &MI, MachineBasicBlock *BB, + bool IsSpill) const; + MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, + unsigned Opcode, bool IsZTDest) const; MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const; + MachineBasicBlock *EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * @@ -968,7 +978,7 @@ const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const; + SDValue ThisVal, bool RequiresSMChange) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2279,10 +2279,12 @@ switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; - MAKE_CASE(AArch64ISD::OBSCURE_COPY) + MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) + MAKE_CASE(AArch64ISD::RESTORE_ZT) + MAKE_CASE(AArch64ISD::SAVE_ZT) MAKE_CASE(AArch64ISD::CALL) MAKE_CASE(AArch64ISD::ADRP) MAKE_CASE(AArch64ISD::ADR) @@ -2519,6 +2521,7 @@ MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) + MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) @@ -2543,6 +2546,7 @@ MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) + MAKE_CASE(AArch64ISD::SST1Q_PRED) MAKE_CASE(AArch64ISD::ST1_PRED) MAKE_CASE(AArch64ISD::SST1_PRED) MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) @@ -2671,13 +2675,18 @@ return BB; } -MachineBasicBlock * -AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { +MachineBasicBlock *AArch64TargetLowering::EmitZASpillFill(MachineInstr &MI, + MachineBasicBlock *BB, + bool IsSpill) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); + BuildMI(*BB, MI, MI.getDebugLoc(), + TII->get(IsSpill ? 
AArch64::STR_ZA : AArch64::LDR_ZA)); - MIB.addReg(AArch64::ZA, RegState::Define); + if (IsSpill) + MIB.addReg(AArch64::ZA); + else + MIB.addReg(AArch64::ZA, RegState::Define); MIB.add(MI.getOperand(0)); // Vector select register MIB.add(MI.getOperand(1)); // Vector select offset MIB.add(MI.getOperand(2)); // Base @@ -2687,6 +2696,63 @@ return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + MIB.add(MI.getOperand(4)); // zm + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // Slice index register + MIB.add(MI.getOperand(2)); // Slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned Opcode, + bool IsZTDest) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB; + + if (IsZTDest) + MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()); + else { + MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode)); + MIB.addReg(MI.getOperand(0).getReg()); + } + + for (unsigned I = 1; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + MachineBasicBlock * AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -2802,9 +2868,17 @@ case AArch64::LD1_MXIPXX_V_PSEUDO_Q: return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); case AArch64::LDR_ZA_PSEUDO: - return EmitFill(MI, BB); + return EmitZASpillFill(MI, BB, /*IsSpill=*/false); + case AArch64::STR_ZA_PSEUDO: + return EmitZASpillFill(MI, BB, /*IsSpill=*/true); + case AArch64::LDR_TX_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*IsZTDest=*/true); + case AArch64::STR_TX_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::STR_TX, /*IsZTDest=*/false); case AArch64::ZERO_M_PSEUDO: return EmitZero(MI, BB); + case AArch64::ZERO_T_PSEUDO: + return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*IsZTDest=*/true); } } @@ -5055,6 +5129,7 @@ case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sme_get_live_za_slices: case Intrinsic::aarch64_sme_cntsb: return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), DAG.getConstant(1, dl, MVT::i32)); @@ -6312,6 +6387,17 @@ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); + // Set the reserved bytes (10-15) to zero + SDValue ReservedPtr = + DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(10, DL, Ptr.getValueType())); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), + ReservedPtr, MPI); + ReservedPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(12, DL, Ptr.getValueType())); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), + ReservedPtr, MPI); + return TPIDR2Obj; } @@ -6371,7 +6457,7 @@ (void)Res; } - SMEAttrs Attrs(MF.getFunction()); + SMEAttrs Attrs(MF.getFunction(), Subtarget->hasSME2()); bool IsLocallyStreaming = !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); @@ -6608,12 +6694,10 @@ // make sure it is Glued to the last CopyFromReg value. if (IsLocallyStreaming) { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - Chain = DAG.getNode( - AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), - {DAG.getRoot(), - DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), - DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); + Chain = + changeStreamingMode(DAG, DL, /*Enable*/ true, DAG.getRoot(), Glue, + DAG.getConstant(0, DL, MVT::i64), /*Entry*/ true); + // Ensure that the SMSTART happens after the CopyWithChain such that its // chain result is used. for (unsigned I=0; IgetRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); // Conservatively assume the function requires the lazy-save mechanism. - if (SMEAttrs(MF.getFunction()).hasZAState()) { + if (SMEAttrs(MF.getFunction(), Subtarget->hasSME2()).hasZAState()) { unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); } @@ -6814,7 +6898,7 @@ SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { + SDValue ThisVal, bool RequiresSMChange) const { DenseMap CopiedRegs; // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -6859,6 +6943,12 @@ break; } + if (RequiresSMChange && (VA.getValVT().isFixedLengthVector() || + (VA.getValVT().isFloatingPoint() && + !VA.getValVT().isScalableVector()))) + Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(), + Val); + InVals.push_back(Val); } @@ -6952,7 +7042,7 @@ // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMEAttrs CallerAttrs(MF.getFunction()); + SMEAttrs CallerAttrs(MF.getFunction(), Subtarget->hasSME2()); auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); if (CallerAttrs.requiresSMChange(CalleeAttrs) || CallerAttrs.requiresLazySave(CalleeAttrs)) @@ -7141,6 +7231,10 @@ SDValue AArch64TargetLowering::changeStreamingMode( SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setHasStreamingModeChanges(true); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); SDValue MSROp = @@ -7284,7 +7378,7 @@ } // Determine whether we need any streaming mode changes. - SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); + SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction(), Subtarget->hasSME2()); if (CLI.CB) CalleeAttrs = SMEAttrs(*CLI.CB); else if (std::optional Attrs = @@ -7293,22 +7387,29 @@ bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); + // The SMEABIPass handles lazy-saves around Invokes, so they do not need + // to be handled again here. + if (RequiresLazySave && CLI.CB && isa(CLI.CB)) + RequiresLazySave = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); if (RequiresLazySave) { // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case N*N) to the TPIDR2 stack object. - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); + // to the TPIDR2 stack object. 
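+    // Layout of the 16-byte TPIDR2 block assumed here (SME lazy-saving scheme):
+    //   bytes 0-7   : pointer to the ZA save buffer
+    //   bytes 8-9   : 16-bit count of live ZA slices to save
+    //   bytes 10-15 : reserved, zeroed when the TPIDR2 object is allocated
+    // The live-slice count is therefore truncated and stored as an i16 at
+    // offset 8 below.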
+ SDValue LiveSlices = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, + DAG.getConstant(Intrinsic::aarch64_sme_get_live_za_slices, + DL, MVT::i32)); + unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue BufferPtrAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); + Chain = + DAG.getTruncStore(Chain, DL, LiveSlices, BufferPtrAddr, MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), @@ -7321,6 +7422,20 @@ if (RequiresSMChange) PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); + bool PreserveZT = CallerAttrs.requiresPreservingZT(CalleeAttrs); + + SDValue ZTFrameIdx; + + if (PreserveZT) { + unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16)); + ZTFrameIdx = DAG.getFrameIndex( + ZTObj, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + + Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + } + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) @@ -7486,8 +7601,11 @@ // Add an extra level of indirection for streaming mode changes by // using a pseudo copy node that cannot be rematerialised between a // smstart/smstop and the call by the simple register coalescer. - if (RequiresSMChange && isa(Arg)) - Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg); + if (RequiresSMChange && (Arg.getValueType().isFixedLengthVector() || + (Arg.getValueType().isFloatingPoint() && + !Arg.getValueType().isScalableVector()))) + Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + Arg.getValueType(), Arg); RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; @@ -7722,7 +7840,8 @@ // return. SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); + IsThisReturn ? OutVals[0] : SDValue(), + RequiresSMChange.has_value()); if (!Ins.empty()) InGlue = Result.getValue(Result->getNumValues() - 1); @@ -7739,7 +7858,14 @@ AArch64ISD::SMSTART, DL, MVT::Other, Result, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + } + + if (PreserveZT) + Result = DAG.getNode( + AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
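+    // (Lazy-save protocol: a callee that needed ZA has already written it to
+    //  the buffer and cleared TPIDR2_EL0, so ZA only needs to be reloaded when
+    //  TPIDR2_EL0 reads back as zero; TPIDR2_EL0 is then reset below.)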
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( @@ -7770,7 +7896,7 @@ DAG.getConstant(0, DL, MVT::i64)); } - if (RequiresSMChange || RequiresLazySave) { + if (RequiresSMChange || RequiresLazySave || PreserveZT) { for (unsigned I = 0; I < InVals.size(); ++I) { // The smstart/smstop is chained as part of the call, but when the // resulting chain is discarded (which happens when the call is not part @@ -7863,13 +7989,11 @@ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); // Emit SMSTOP before returning from a locally streaming function - SMEAttrs FuncAttrs(MF.getFunction()); + SMEAttrs FuncAttrs(MF.getFunction(), Subtarget->hasSME2()); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { - Chain = DAG.getNode( - AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, - DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), - DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); + Chain = changeStreamingMode( + DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(), + DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); Glue = Chain.getValue(1); } @@ -16558,7 +16682,8 @@ // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); - if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) && + N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); @@ -22240,8 +22365,11 @@ return SDValue(); // For FPs, ACLE only supports _packed_ single and double precision types. 
+ // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes if (SrcElVT.isFloatingPoint()) - if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64)) + if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) && + (Opcode != AArch64ISD::SST1Q_PRED || + ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16)))) return SDValue(); // Depending on the addressing mode, this is either a pointer or a vector of @@ -23160,6 +23288,8 @@ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); case Intrinsic::aarch64_sve_ld1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); + case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO); case Intrinsic::aarch64_sve_ld1_gather_index: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SCALED_MERGE_ZERO); @@ -23203,6 +23333,8 @@ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_IMM_MERGE_ZERO); + case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED); case Intrinsic::aarch64_sve_st1_scatter: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); case Intrinsic::aarch64_sve_st1_scatter_index: @@ -23241,6 +23373,19 @@ return DAG.getMergeValues( {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } + case Intrinsic::aarch64_sme_invoke_resume_pstatesm: + return changeStreamingMode(DAG, SDLoc(N), /*Enable*/ true, + /*Chain*/ N->getOperand(0), + /*Glue*/ N->getOperand(0).getValue(1), + N->getOperand(2), /*Entry*/ false); + case Intrinsic::aarch64_sme_ldr_zt: + return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sme_str_zt: + return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); default: break; } @@ -24682,7 +24827,7 @@ // Checks to allow the use of SME instructions if (auto *Base = dyn_cast(&Inst)) { - auto CallerAttrs = SMEAttrs(*Inst.getFunction()); + auto CallerAttrs = SMEAttrs(*Inst.getFunction(), Subtarget->hasSME2()); auto CalleeAttrs = SMEAttrs(*Base); if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/false) || diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -786,8 +786,11 @@ def Imm0_0Operand : AsmImmRange<0, 0>; def Imm0_1Operand : AsmImmRange<0, 1>; +def Imm1_1Operand : AsmImmRange<1, 1>; def Imm0_3Operand : AsmImmRange<0, 3>; +def Imm1_3Operand : AsmImmRange<1, 3>; def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm1_7Operand : AsmImmRange<1, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; def Imm0_63Operand : AsmImmRange<0, 63>; @@ -1003,6 +1006,13 @@ let ParserMatchClass = Imm0_1Operand; } +// timm32_0_0 predicate - True if the 32-bit immediate is in the range [0,0] +def timm32_0_0 : Operand, TImmLeaf { + let ParserMatchClass = Imm0_0Operand; +} + // timm32_0_1 predicate - True if the 32-bit immediate is in the range [0,1] def timm32_0_1 : Operand, TImmLeaf, TImmLeaf { + let ParserMatchClass = Imm1_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand, ImmLeaf, TImmLeaf 0 && ((uint32_t)Imm) < 4; +}]> { + let 
ParserMatchClass = Imm1_3Operand; +} + // timm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7] def timm32_0_7 : Operand, TImmLeaf, TImmLeaf 0 && ((uint32_t)Imm) < 8; +}]> { + let ParserMatchClass = Imm1_7Operand; +} + // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand, ImmLeaf; + defm VectorIndex032b : VectorIndex; } } defm VectorIndex1 : VectorIndex; -def uimm3s8 : Operand, ImmLeaf, TImmLeaf= 0 && Imm <= 56 && ((Imm % 8) == 0); }], UImmS8XForm> { let PrintMethod = "printVectorIndex<8>"; let ParserMatchClass = UImm3s8Operand; @@ -12194,3 +12227,4 @@ def : TokenAlias<".S", ".s">; def : TokenAlias<".D", ".d">; def : TokenAlias<".Q", ".q">; +def : TokenAlias<".2B", ".2b">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -340,6 +340,8 @@ static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized); + + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3117,6 +3117,12 @@ MinOffset = 0; MaxOffset = 63; break; + case AArch64::LDR_ZA: + case AArch64::STR_ZA: + Scale = TypeSize::Scalable(16); + Width = SVEMaxBytesPerVector; + MinOffset = 0; + MaxOffset = 15; } return true; @@ -3603,8 +3609,10 @@ } // Copy a Z register pair by copying the individual sub-registers. - if (AArch64::ZPR2RegClass.contains(DestReg) && - AArch64::ZPR2RegClass.contains(SrcReg)) { + if ((AArch64::ZPR2RegClass.contains(DestReg) || + AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && + (AArch64::ZPR2RegClass.contains(SrcReg) || + AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -3624,8 +3632,10 @@ } // Copy a Z register quad by copying the individual sub-registers. 
- if (AArch64::ZPR4RegClass.contains(DestReg) && - AArch64::ZPR4RegClass.contains(SrcReg)) { + if ((AArch64::ZPR4RegClass.contains(DestReg) || + AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && + (AArch64::ZPR4RegClass.contains(SrcReg) || + AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2, AArch64::zsub3}; @@ -3956,7 +3966,8 @@ assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || + AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -3978,7 +3989,8 @@ assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || + AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -4112,7 +4124,8 @@ assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || + AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -4134,7 +4147,8 @@ assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || + AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -8440,6 +8454,58 @@ return AArch64::BLR; } +bool AArch64InstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getMF(); + const AArch64FunctionInfo &AFI = *MF.getInfo(); + + // If the function contains changes to streaming mode, then there + // is a danger that rematerialised instructions end up between + // instruction sequences (e.g. call sequences, or prolog/epilogue) + // where the streaming-SVE mode is temporarily changed. + if (AFI.hasStreamingModeChanges()) { + // Avoid rematerializing instructions that use/define scalable values, + // such as 'pfalse' or 'ptrue', which result in different results + // when the runtime vector length is different. 
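+    // (For example, a PTRUE with an 'all' pattern that is rematerialised on
+    //  the other side of an smstart/smstop would define a different predicate,
+    //  because the streaming and non-streaming vector lengths may differ.)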
+ const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) { + if (MO.isFI() && + MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector) + return true; + if (!MO.isReg()) + return false; + + if (MO.getReg().isVirtual()) { + const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); + return AArch64::ZPRRegClass.hasSubClassEq(RC) || + AArch64::PPRRegClass.hasSubClassEq(RC); + } + return AArch64::ZPRRegClass.contains(MO.getReg()) || + AArch64::PPRRegClass.contains(MO.getReg()); + })) + return false; + + // Avoid rematerializing instructions that return a value that is + // different depending on vector length, even when it is not returned + // in a scalable vector/predicate register. + switch (MI.getOpcode()) { + default: + break; + case AArch64::RDVLI_XI: + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + case AArch64::CNTB_XPiI: + case AArch64::CNTH_XPiI: + case AArch64::CNTW_XPiI: + case AArch64::CNTD_XPiI: + return false; + } + } + + return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -185,6 +185,8 @@ /// The frame-index for the TPIDR2 object used for lazy saves. Register LazySaveTPIDR2Obj = 0; + /// Whether this function changes streaming mode within the function. + bool HasStreamingModeChanges = false; /// True if the function need unwind information. mutable std::optional NeedsDwarfUnwindInfo; @@ -447,6 +449,11 @@ bool needsDwarfUnwindInfo(const MachineFunction &MF) const; bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const; + bool hasStreamingModeChanges() const { return HasStreamingModeChanges; } + void setHasStreamingModeChanges(bool HasChanges) { + HasStreamingModeChanges = HasChanges; + } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64InstPrinter.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -440,6 +441,12 @@ Reserved.set(SubReg); } + if (MF.getSubtarget().hasSME2()) { + for (MCSubRegIterator SubReg(AArch64::ZT0, this, /*self=*/true); + SubReg.isValid(); ++SubReg) + Reserved.set(*SubReg); + } + markSuperRegs(Reserved, AArch64::FPCR); assert(checkAllSuperRegsMarked(Reserved)); @@ -987,6 +994,8 @@ MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + if (MI->isCopy() && ((DstRC->getID() == AArch64::GPR64RegClassID) || (DstRC->getID() == AArch64::GPR64commonRegClassID)) && @@ -995,5 +1004,38 @@ // which implements a 32 to 64 bit zero extension // which relies on the upper 32 bits being zeroed. 
return false; + + auto IsCoalescerBarrier = [](const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + return true; + default: + return false; + } + }; + + // For calls that temporarily have to toggle streaming mode as part of the + // call-sequence, we need to be more careful when coalescing copy instructions + // so that we don't end up coalescing the NEON/FP result or argument register + // with a whole Z-register, such that after coalescing the register allocator + // will try to spill/reload the entire Z register. + // + // We do this by checking if the node has any defs/uses that are COALESCER_BARRIER + // pseudos. These are 'nops' in practice, but they exist to instruct the + // coalescer to avoid coalescing the copy. + if (MI->isCopy() && SubReg != DstSubReg && + (AArch64::ZPRRegClass.hasSubClassEq(DstRC) || + AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) { + unsigned SrcReg = MI->getOperand(1).getReg(); + if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier)) + return false; + } + return true; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1250,6 +1250,10 @@ let EncoderMethod = "EncodeRegAsMultipleOf<2>", DecoderMethod = "DecodeZPR2Mul2RegisterClass" in { + def ZZ_mul_r : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<0, 2>; + } + def ZZ_b_mul_r : RegisterOperand"> { let ParserMatchClass = ZPRVectorListMul<8, 2>; } @@ -1331,16 +1335,16 @@ (trunc (rotl ZPR, 24), 4), (trunc (rotl ZPR, 28), 4) ]>; -def ZPR2Strided : RegisterClass<"AArch64", [untyped], 256, +def ZPR2Strided : RegisterClass<"AArch64", [untyped], 128, (add ZStridedPairsLo, ZStridedPairsHi)> { let Size = 256; } -def ZPR4Strided : RegisterClass<"AArch64", [untyped], 512, +def ZPR4Strided : RegisterClass<"AArch64", [untyped], 128, (add ZStridedQuadsLo, ZStridedQuadsHi)> { let Size = 512; } -def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 256, +def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128, (add ZStridedPairsLo, ZStridedPairsHi, (decimate ZSeqPairs, 2))> { let Size = 256; @@ -1387,7 +1391,7 @@ : RegisterOperand">; } -def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 512, +def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128, (add ZStridedQuadsLo, ZStridedQuadsHi, (decimate ZSeqQuads, 4))> { let Size = 512; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -22,8 +22,15 @@ [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; +def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; -def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>; +def AArch64CoalescerBarrier + : 
SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>; //===----------------------------------------------------------------------===// // Instruction naming conventions. @@ -186,9 +193,25 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; -def OBSCURE_COPY : Pseudo<(outs GPR64:$dst), (ins GPR64:$idx), []>, Sched<[]> { } -def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))), - (OBSCURE_COPY GPR64:$idx)>; +multiclass CoalescerBarrierPseudo vts> { + def NAME : Pseudo<(outs rc:$dst), (ins rc:$idx), []>, Sched<[]> { + let Constraints = "$dst = $idx"; + } + foreach vt = vts in { + def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$idx))), + (!cast(NAME) rc:$idx)>; + } +} + +multiclass CoalescerBarriers { + defm _FPR16 : CoalescerBarrierPseudo; + defm _FPR32 : CoalescerBarrierPseudo; + defm _FPR64 : CoalescerBarrierPseudo; + defm _FPR128 : CoalescerBarrierPseudo; +} + +defm COALESCER_BARRIER : CoalescerBarriers; + } // End let Predicates = [HasSME] // Pseudo to match to smstart/smstop. This expands: @@ -545,19 +568,19 @@ defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>; defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>; -def ZERO_T : sme2_zero_zt<"zero", 0b0001>; +defm ZERO_T : sme2_zero_zt<"zero", 0b0001>; -def LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100>; -def STR_TX : sme2_spill_fill_vector<"str", 0b11111100>; +defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>; +defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>; def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>; def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>; -defm LUTI2_ZTZI : sme2_luti2_vector_index<"luti2">; +defm LUTI2_ZTZI : sme2_luti2_vector_index<"luti2", int_aarch64_sme_luti2_lane_zt>; defm LUTI2_2ZTZI : sme2_luti2_vector_vg2_index<"luti2">; defm LUTI2_4ZTZI : sme2_luti2_vector_vg4_index<"luti2">; -defm LUTI4_ZTZI : sme2_luti4_vector_index<"luti4">; +defm LUTI4_ZTZI : sme2_luti4_vector_index<"luti4", int_aarch64_sme_luti4_lane_zt>; defm LUTI4_2ZTZI : sme2_luti4_vector_vg2_index<"luti4">; defm LUTI4_4ZTZI : sme2_luti4_vector_vg4_index<"luti4">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -100,6 +100,8 @@ def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +// Gather vector base + scalar offset +def AArch64ld1q_gather_z: SDNode<"AArch64ISD::GLD1Q_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; // Contiguous stores - node definitions // @@ -132,6 +134,9 @@ def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; +// Scatter vector base + scalar offset +def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + // AArch64 SVE/SVE2 - the remaining node definitions // @@ -1157,7 +1162,7 @@ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>; defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>; let Predicates = [HasSVE2p1] in { - defm GLD1Q : sve_mem_128b_gld_64_unscaled<"ld1q">; + defm GLD1Q : 
sve_mem_128b_gld_64_unscaled<"ld1q", AArch64ld1q_gather_z>; } // Gathers using scaled 64-bit offsets, e.g. @@ -1338,6 +1343,49 @@ let Predicates = [HasSVE2p1] in { defm ST1D_Q : sve_mem_cst_ss<0b1110, "st1d", Z_q, ZPR128, GPR64NoXZRshifted64>; } + + multiclass sve_ld1q_pat { + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$imm))), + (RegImmInst PPR3bAny:$Pg, GPR64sp:$base, simm4s1:$imm)>; + } + + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (AddrCP GPR64sp:$base, GPR64:$offset))), + (RegRegInst PPR3bAny:$Pg, GPR64sp:$base, GPR64:$offset)>; + } + + def _default : Pat<(Ty (Load1qOp (PredTy PPR3bAny:$Pg), (i64 GPR64sp:$base))), + (RegImmInst PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + } + + multiclass sve_st1q_pat { + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$imm)), + (RegImmInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, simm4s1:$imm)>; + } + + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (AddrCP GPR64sp:$base, GPR64:$offset)), + (RegRegInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, GPR64:$offset)>; + } + + def _default : Pat<(Store1qOp (DataType ZPR128:$Zt), (PredTy PPR3bAny:$Pg), (i64 GPR64sp:$base)), + (RegImmInst Z_q:$Zt, PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + } + + // ld1quw/st1quw + defm : sve_ld1q_pat; + defm : sve_ld1q_pat; + defm : sve_st1q_pat; + defm : sve_st1q_pat; + + // ld1qud/st1qud + defm : sve_ld1q_pat; + defm : sve_ld1q_pat; + defm : sve_st1q_pat; + defm : sve_st1q_pat; + } // End HasSVEorSME let Predicates = [HasSVE] in { @@ -1385,7 +1433,7 @@ defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>; defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>; let Predicates = [HasSVE2p1] in { - defm SST1Q : sve_mem_sst_128b_64_unscaled<"st1q">; + defm SST1Q : sve_mem_sst_128b_64_unscaled<"st1q", AArch64st1q_scatter>; } // Scatters using scaled 64-bit offsets, e.g. 
@@ -2806,6 +2854,7 @@ } defm Pat_Store_P16 : unpred_store_predicate; + defm Pat_Store_PredAsCount : unpred_store_predicate; multiclass unpred_load_predicate { def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))), @@ -2816,6 +2865,7 @@ } defm Pat_Load_P16 : unpred_load_predicate; + defm Pat_Load_PredAsCount : unpred_load_predicate; multiclass ld1 { @@ -3755,10 +3805,10 @@ defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; -def BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb">; -def BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt">; -def BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb">; -def BFMLSLT_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b111, "bfmlslt">; +defm BFMLSLB_ZZZ_S : sve_bfloat_matmul_longvecl<0b0, 0b1, "bfmlslb", int_aarch64_sve_bfmlslb>; +defm BFMLSLT_ZZZ_S : sve_bfloat_matmul_longvecl<0b1, 0b1, "bfmlslt", int_aarch64_sve_bfmlslt>; +defm BFMLSLB_ZZZI_S : sve_bfloat_matmul_longvecl_idx<0b0, 0b1, "bfmlslb", int_aarch64_sve_bfmlslb_lane>; +defm BFMLSLT_ZZZI_S : sve_bfloat_matmul_longvecl_idx<0b1, 0b1, "bfmlslt", int_aarch64_sve_bfmlslt_lane>; defm SDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"sdot", 0b0, int_aarch64_sve_sdot_x2>; defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2>; @@ -3944,27 +3994,54 @@ //===----------------------------------------------------------------------===// let Predicates = [HasSVE2p1_or_HasSME2p1, HasB16B16] in { -def BFADD_ZZZ : sve_fp_3op_u_zd<0b00, 0b000, "bfadd", ZPR16>; -def BFSUB_ZZZ : sve_fp_3op_u_zd<0b00, 0b001, "bfsub", ZPR16>; -def BFMUL_ZZZ : sve_fp_3op_u_zd<0b00, 0b010, "bfmul", ZPR16>; -def BFMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b00, "bfmla", ZPR16>; -def BFMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b01, "bfmls", ZPR16>; + let Predicates = [UseExperimentalZeroingPseudos] in { + defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing; + } + +defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>; +defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>; + +defm BFADD_ZZZ : sve2p1_bf_3op_u_zd<0b000, "bfadd", fadd, AArch64fadd_p>; +defm BFSUB_ZZZ : sve2p1_bf_3op_u_zd<0b001, "bfsub", fsub, AArch64fsub_p>; +defm BFMUL_ZZZ : sve2p1_bf_3op_u_zd<0b010, "bfmul", fmul, AArch64fmul_p>; + +defm BFADD_ZPmZZ : sve2p1_bf_2op_p_zds<0b0000, "bfadd", "BFADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>; +defm BFSUB_ZPmZZ : sve2p1_bf_2op_p_zds<0b0001, "bfsub", "BFSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryComm>; +defm BFMUL_ZPmZZ : sve2p1_bf_2op_p_zds<0b0010, "bfmul", "BFMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>; + +defm BFADD_ZPZZ : sve2p1_bf_bin_pred_zds; +defm BFSUB_ZPZZ : sve2p1_bf_bin_pred_zds; +defm BFMUL_ZPZZ : sve2p1_bf_bin_pred_zds; + +defm BFMAXNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0100, "bfmaxnm", "BFMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; +defm BFMINNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0101, "bfminnm", "BFMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; + +defm BFMAX_ZPmZZ : sve2p1_bf_2op_p_zds<0b0110, "bfmax", "BFMAX_ZPZZ", 
int_aarch64_sve_fmax, DestructiveBinaryComm>; +defm BFMIN_ZPmZZ : sve2p1_bf_2op_p_zds<0b0111, "bfmin", "BFMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>; + +defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds; +defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds; + +defm BFMAX_ZPZZ : sve2p1_bf_bin_pred_zds; +defm BFMIN_ZPZZ : sve2p1_bf_bin_pred_zds; + +defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10, int_aarch64_sve_fmla_lane>; +defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11, int_aarch64_sve_fmls_lane>; -def BFADD_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0000, "bfadd", ZPR16>; -def BFSUB_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0001, "bfsub", ZPR16>; -def BFMUL_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0010, "bfmul", ZPR16>; -def BFMAXNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0100, "bfmaxnm", ZPR16>; -def BFMINNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0101, "bfminnm", ZPR16>; -def BFMAX_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0110, "bfmax", ZPR16>; -def BFMIN_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0111, "bfmin", ZPR16>; +defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>; -defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10>; -defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11>; +defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp>; -defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul">; +defm BFMLA_ZPZZZ : sve_fp_3op_pred_bf; +defm BFMLS_ZPZZZ : sve_fp_3op_pred_bf; -def BFCLAMP_ZZZ : sve2p1_fclamp<"bfclamp", 0b00, ZPR16>; } // End HasSVE2p1_or_HasSME2p1, HasB16B16 @@ -3972,31 +4049,32 @@ // SME2.1 or SVE2.1 instructions //===----------------------------------------------------------------------===// let Predicates = [HasSVE2p1_or_HasSME2p1] in { -defm FADDQV : sve2p1_fp_reduction_q<0b000, "faddqv">; -defm FMAXNMQV : sve2p1_fp_reduction_q<0b100, "fmaxnmqv">; -defm FMINNMQV : sve2p1_fp_reduction_q<0b101, "fminnmqv">; -defm FMAXQV : sve2p1_fp_reduction_q<0b110, "fmaxqv">; -defm FMINQV : sve2p1_fp_reduction_q<0b111, "fminqv">; +defm FADDQV : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_addqv>; +defm FMAXNMQV : sve2p1_fp_reduction_q<0b100, "fmaxnmqv", int_aarch64_sve_fmaxnmqv>; +defm FMINNMQV : sve2p1_fp_reduction_q<0b101, "fminnmqv", int_aarch64_sve_fminnmqv>; +defm FMAXQV : sve2p1_fp_reduction_q<0b110, "fmaxqv", int_aarch64_sve_fmaxqv>; +defm FMINQV : sve2p1_fp_reduction_q<0b111, "fminqv", int_aarch64_sve_fminqv>; defm DUPQ_ZZI : sve2p1_dupq<"dupq">; -def EXTQ_ZZI : sve2p1_extq<"extq">; - -defm PMOV_PZI : sve2p1_vector_to_pred<"pmov">; -defm PMOV_ZIP : sve2p1_pred_to_vector<"pmov">; - -defm ORQV_VPZ : sve2p1_int_reduce_q<0b1100, "orqv">; -defm EORQV_VPZ : sve2p1_int_reduce_q<0b1101, "eorqv">; -defm ANDQV_VPZ : sve2p1_int_reduce_q<0b1110, "andqv">; -defm ADDQV_VPZ : sve2p1_int_reduce_q<0b0001, "addqv">; -defm SMAXQV_VPZ : sve2p1_int_reduce_q<0b0100, "smaxqv">; -defm UMAXQV_VPZ : sve2p1_int_reduce_q<0b0101, "umaxqv">; -defm SMINQV_VPZ : sve2p1_int_reduce_q<0b0110, "sminqv">; -defm UMINQV_VPZ : sve2p1_int_reduce_q<0b0111, "uminqv">; - -defm TBXQ_ZZZ : sve2_int_perm_tbx<"tbxq", 0b10, null_frag>; -defm ZIPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b000, "zipq1">; -defm ZIPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b001, "zipq2">; -defm UZPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b010, "uzpq1">; -defm UZPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b011, "uzpq2">; -defm TBLQ_ZZZ : sve2p1_tblq<"tblq">; +defm EXTQ_ZZI : sve2p1_extq<"extq", int_aarch64_sve_extq_lane>; + +defm PMOV_PZI : sve2p1_vector_to_pred<"pmov", int_aarch64_sve_pmov_to_pred_lane>; +defm 
PMOV_ZIP : sve2p1_pred_to_vector<"pmov", int_aarch64_sve_pmov_to_vector_lane_merging, + int_aarch64_sve_pmov_to_vector_lane_zeroing>; + +defm ORQV_VPZ : sve2p1_int_reduce_q<0b1100, "orqv", int_aarch64_sve_orqv>; +defm EORQV_VPZ : sve2p1_int_reduce_q<0b1101, "eorqv", int_aarch64_sve_eorqv>; +defm ANDQV_VPZ : sve2p1_int_reduce_q<0b1110, "andqv", int_aarch64_sve_andqv>; +defm ADDQV_VPZ : sve2p1_int_reduce_q<0b0001, "addqv", int_aarch64_sve_addqv>; +defm SMAXQV_VPZ : sve2p1_int_reduce_q<0b0100, "smaxqv", int_aarch64_sve_smaxqv>; +defm UMAXQV_VPZ : sve2p1_int_reduce_q<0b0101, "umaxqv", int_aarch64_sve_umaxqv>; +defm SMINQV_VPZ : sve2p1_int_reduce_q<0b0110, "sminqv", int_aarch64_sve_sminqv>; +defm UMINQV_VPZ : sve2p1_int_reduce_q<0b0111, "uminqv", int_aarch64_sve_uminqv>; + +defm ZIPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b000, "zipq1", int_aarch64_sve_zipq1>; +defm ZIPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b001, "zipq2", int_aarch64_sve_zipq2>; +defm UZPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b010, "uzpq1", int_aarch64_sve_uzpq1>; +defm UZPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b011, "uzpq2", int_aarch64_sve_uzpq2>; +defm TBLQ_ZZZ : sve2p1_tblq<"tblq", int_aarch64_sve_tblq>; +defm TBXQ_ZZZ : sve2_int_perm_tbx<"tbxq", 0b10, int_aarch64_sve_tbxq>; } // End HasSVE2p1_or_HasSME2p1 diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -165,6 +165,11 @@ cl::desc("Enable SVE intrinsic opts"), cl::init(true)); +cl::opt + EnableSMEPeepholeOpt("enable-aarch64-sme-peephole-opt", cl::init(true), + cl::Hidden, + cl::desc("Perform SME peephole optimization")); + static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -218,6 +223,7 @@ initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); + initializeSMEPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); @@ -714,6 +720,9 @@ } void AArch64PassConfig::addMachineSSAOptimization() { + if (TM->getOptLevel() != CodeGenOpt::None && EnableSMEPeepholeOpt) + addPass(createSMEPeepholeOptPass()); + // Run default MachineSSAOptimization first. 
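
(Reviewer note, illustrative; not part of the patch.) SMEPeepholeOpt is added here just before the generic machine-SSA optimizations; it deletes matching smstart/smstop pairs that cancel each other out. A hedged source-level sketch of the case it targets, using a hypothetical caller/callee and the ACLE __arm_streaming keyword:

    void streaming_callee(void) __arm_streaming;   // hypothetical declaration

    void normal_caller(void) {                     // hypothetical caller
      // Each call below is bracketed with smstart sm / smstop sm because the
      // caller is not streaming. The smstop emitted after the first call and
      // the smstart emitted before the second form a matching pair with only
      // call-setup instructions between them, so optimizeStartStopPairs can
      // erase both and keep streaming mode enabled across the two calls.
      streaming_callee();
      streaming_callee();
    }
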
TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -9,6 +9,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" #include "AArch64PerfectShuffle.h" +#include "Utils/AArch64BaseInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" @@ -192,7 +193,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); + SMEAttrs CallerAttrs(*Caller, ST->hasSME2()); SMEAttrs CalleeAttrs(*Callee); if (CallerAttrs.requiresSMChange(CalleeAttrs, /*BodyOverridesInterface=*/true) || diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -74,6 +74,7 @@ AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SLSHardening.cpp + SMEPeepholeOpt.cpp AArch64SelectionDAGInfo.cpp AArch64SpeculationHardening.cpp AArch64StackTagging.cpp diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -532,8 +532,8 @@ } SMEAttrs Attrs(F); - if (Attrs.hasNewZAInterface() || - (!Attrs.hasStreamingInterface() && Attrs.hasStreamingBody())) + if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() || + Attrs.hasStreamingCompatibleInterface()) return true; return false; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -13,10 +13,13 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -40,14 +43,31 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + private: - bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder); + bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder, + bool ClearZTState); + bool handleExceptions(IRBuilder<> &Builder, Function *F, StructType *TPITy); + AllocaInst *createZABufferAndTPIDR2Block(IRBuilder<> &Builder, Function *F, + StructType *TPITy, + StringRef ObjName); + void setupLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *CheckBB, + BasicBlock *ResumeBB, BasicBlock *RestoreBB); + void restoreLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *RestoreBB, + BasicBlock *ResumeBB = nullptr, + Value *PStateOnEntry = nullptr); }; } // end anonymous namespace char SMEABI::ID = 0; static const char *name = "SME ABI Pass"; INITIALIZE_PASS_BEGIN(SMEABI, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(SMEABI, DEBUG_TYPE, name, false, 
false) FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } @@ -56,8 +76,42 @@ // Utility functions //===----------------------------------------------------------------------===// +// Utility function to emit a call to __arm_sme_state and return 'pstate.sm' value. +Value *emitGetPStateSM(Module *M, IRBuilder<> &Builder) { + auto *FTy = FunctionType::get( + StructType::create({Builder.getInt64Ty(), Builder.getInt64Ty()}), {}, + /*IsVarArgs=*/false); + auto Attrs = + AttributeList() + .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(M->getContext(), "aarch64_pstate_za_preserved"); + FunctionCallee Callee = M->getOrInsertFunction("__arm_sme_state", FTy, Attrs); + CallInst *Call = Builder.CreateCall(Callee); + Call->setCallingConv( + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2); + + // Extract PSTATE.SM from X0 + Value *X0 = Builder.CreateExtractValue(Call, 0); + return Builder.CreateAnd(X0, Builder.getInt64(1)); +} + +// Utility function to emit a call to __arm_tpidr2_restore. +void emitTPIDR2Restore(Module *M, IRBuilder<> &Builder, Value *TPIDR2Obj) { + auto *FTy = FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, + /*IsVarArgs=*/false); + auto Attrs = + AttributeList() + .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") + .addFnAttribute(M->getContext(), "aarch64_pstate_za_shared"); + FunctionCallee Callee = M->getOrInsertFunction("__arm_tpidr2_restore", FTy, Attrs); + CallInst *Call = Builder.CreateCall( + Callee, Builder.CreatePointerCast(TPIDR2Obj, Builder.getInt8PtrTy())); + Call->setCallingConv( + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); +} + // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ClearTPIDR2) { auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = @@ -70,11 +124,280 @@ Call->setCallingConv( CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); - // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. + if (ClearTPIDR2) { + Function *WriteIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, + Builder.getInt64(0)); + } +} + +void SMEABI::restoreLazySave( + IRBuilder<> &Builder, Instruction *Call, AllocaInst *TPIObj, + BasicBlock *RestoreBB, BasicBlock *ResumeBB, Value *PStateOnEntry) { + Module *M = Call->getModule(); + + // If Call is an Invoke, restore the lazy save in the normal destination. + // Otherwise, restore the lazy save immediately after Call. + if (auto *II = dyn_cast(Call)) + Builder.SetInsertPoint(II->getNormalDest()->getFirstNonPHIOrDbg()); + else + Builder.SetInsertPoint(Call->getParent(), std::next(Call->getIterator())); + + // Re-enable pstate.za. + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + + // Create an intrinsic call to restore ZA, passing the 64-bit data pointer + // to the TPIDR2 block. 
+ if (ResumeBB) + Builder.SetInsertPoint(RestoreBB); + emitTPIDR2Restore(M, Builder, TPIObj); + if (ResumeBB) + Builder.CreateBr(ResumeBB); +} + +void SMEABI::setupLazySave(IRBuilder<> &Builder, Instruction *Call, + AllocaInst *TPIObj, BasicBlock *CheckBB, + BasicBlock *ResumeBB, BasicBlock *RestoreBB) { + Module *M = Call->getModule(); + Builder.SetInsertPoint(Call); + + // Store the number of live slices to the num_za_save_slices field + // of the TPIDR2 block + Function *LiveIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_live_za_slices); + auto *Live = + Builder.CreateCall(LiveIntr->getFunctionType(), LiveIntr, {}, "live"); + auto *Trunc = Builder.CreateTrunc(Live, Builder.getInt16Ty(), "live.trunc"); + auto *TPILive = Builder.CreateGEP(TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(1)}, + "tpidr2.obj.live"); + Builder.CreateStore(Trunc, TPILive); + + auto *PtrToInt = + Builder.CreatePtrToInt(TPIObj, Builder.getInt64Ty(), "tpi.int"); + + if (dyn_cast(Call)) { + // Restart pstate.za if this is an Invoke, as we may be setting up + // a lazy-save in the exception handler. + // TODO: This will start pstate.za unnecessarily if the parent block is + // not an unwind destination. It might be possible to improve this by + // creating a mapping of blocks to ZA/SM states. + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + } + + // Set TPIDR2_EL0 to the new object Function *WriteIntr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, PtrToInt); + + if (!CheckBB) { + // If no CheckBB block was passed in, Call should be an Invoke with shared + // or preserved ZA and we don't need to restore the lazy-save unless we + // catch an exception. Abandon the lazy-save before resuming the function + // at the normal destination. + auto *Invoke = dyn_cast(Call); + assert(Invoke && "CheckBB has not been provided for restoring lazy-save."); + auto *FirstInst = Invoke->getNormalDest()->getFirstNonPHIOrDbg(); + auto *II = dyn_cast(FirstInst); + // Avoid setting TPIDR2_EL0 to null more than once. + if (!II || II->getIntrinsicID() != Intrinsic::aarch64_sme_set_tpidr2) { + Builder.SetInsertPoint(FirstInst); + Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, + ConstantInt::get(Builder.getInt64Ty(), 0)); + } + return; + } + + // Check if the lazy save has been committed by the callee and should + // be restored. + Builder.SetInsertPoint(CheckBB); + Function *GetEL0Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + auto *GetEL0 = Builder.CreateCall(GetEL0Intr->getFunctionType(), GetEL0Intr, + {}, "tpidr2"); + auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, GetEL0, + ConstantInt::get(GetEL0->getType(), 0), "cmp"); + Builder.CreateCondBr(Cmp, RestoreBB, ResumeBB); + + // Conditionally store ZA after the call. + restoreLazySave(Builder, Call, TPIObj, RestoreBB, ResumeBB); + + // Ensure we only set TPIDR2_EL0 to null once. + auto *FirstInst = ResumeBB->getFirstNonPHIOrDbg(); + auto *II = dyn_cast(FirstInst); + if (II && II->getIntrinsicID() == Intrinsic::aarch64_sme_set_tpidr2) + return; + + // Set TPIDR2_EL0 to null before continuing with the rest of the function. + // This will already be null if ZA was restored above, but is necessary + // to abandon the lazy-save if the callee did not commit it. 
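  // (Illustrative sketch, not code emitted verbatim by this function.) For a
  // private-ZA invoke, the blocks wired up above end up looking roughly like:
  //
  //   ...                            ; before the invoke: the live-slice count
  //                                  ; is stored and TPIDR2_EL0 is pointed at
  //                                  ; the on-stack TPIDR2 block
  //   invoke callee() to label %check.za unwind label %lpad
  // check.za:                        ; new normal destination of the invoke
  //   smstart za                     ; aarch64.sme.za.enable
  //   %tpidr2 = aarch64.sme.get.tpidr2()
  //   %cmp = icmp eq i64 %tpidr2, 0
  //   br i1 %cmp, label %restore.za, label %resume
  // restore.za:                      ; the callee committed our lazy save
  //   call __arm_tpidr2_restore(<tpidr2 block>)
  //   br label %resume
  // resume:                          ; original normal destination
  //   aarch64.sme.set.tpidr2(0)      ; abandon the lazy save otherwise
  //   ...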
+ Builder.SetInsertPoint(FirstInst); Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, - Builder.getInt64(0)); + ConstantInt::get(Builder.getInt64Ty(), 0)); + + return; +} + +AllocaInst *SMEABI::createZABufferAndTPIDR2Block( + IRBuilder<> &Builder, Function *F, StructType *TPITy, StringRef ObjName) { + Module *M = F->getParent(); + + Builder.SetInsertPoint(&*F->getEntryBlock().getFirstInsertionPt()); + auto *TPIObj = Builder.CreateAlloca(TPITy, nullptr, ObjName); + + // Allocate a buffer big enough to hold the max ZA size (SVL.B x SVL.B) + Function *NIntr = Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_cntsb); + auto *N = Builder.CreateCall(NIntr->getFunctionType(), NIntr, {}, "N"); + auto *NN = Builder.CreateMul(N, N, "NN"); + auto *Buffer = Builder.CreateAlloca(Builder.getInt8Ty(), NN, "buffer"); + Buffer->setAlignment(Align(16)); + + // Fill the za_save_buffer field of the new TPIDR2 block + auto *TPIBuffer = Builder.CreateGEP( + TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(0)}, ObjName + ".buffer"); + Builder.CreateStore(Buffer, TPIBuffer); + + // Clear reserved bytes 10-15 of the TPIDR2 block + auto *Reserved = Builder.CreateAlloca(Builder.getIntNTy(48), + Builder.getIntN(48, 0), "zero"); + Reserved->setAlignment(Align(16)); + TPIBuffer = + Builder.CreateGEP(TPIObj->getAllocatedType(), TPIObj, + {Builder.getInt64(0), Builder.getInt32(2)}, "reserved"); + Builder.CreateStore(Reserved, TPIBuffer); + + return TPIObj; +} + +bool SMEABI::handleExceptions(IRBuilder<> &Builder, Function *F, + StructType *TPITy) { + Module *M = F->getParent(); + LLVMContext &Context = F->getContext(); + SmallVector Invokes; + SmallVector Resumes; + SmallVector BeginCatchCalls; + SMEAttrs FnAttrs(*F); + + AllocaInst *TPIObj = nullptr; + + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + if (auto *Invoke = dyn_cast(&I)) + Invokes.push_back(Invoke); + else if (auto *Resume = dyn_cast(&I)) + Resumes.push_back(Resume); + else if (auto *Call = dyn_cast(&I)) { + if (Call->getCalledFunction() && + Call->getCalledFunction()->getName() == "__cxa_begin_catch") + BeginCatchCalls.push_back(Call); + } + } + } + + for (InvokeInst *Invoke : Invokes) { + // We only need to set up and restore a lazy-save if there is ZA state. + if (!FnAttrs.hasZAState()) + continue; + + if (!TPIObj) + TPIObj = + createZABufferAndTPIDR2Block(Builder, F, TPITy, "tpidr2.invoke.obj"); + + SMEAttrs InvokeAttrs(*Invoke); + if (InvokeAttrs.hasSharedZAInterface() || InvokeAttrs.preservesZA()) { + // If the Invoke instruction is shared or preserved ZA, setupLazySave + // does not need to restore the lazy-save at the normal destination. + setupLazySave(Builder, Invoke, TPIObj, nullptr, nullptr, nullptr); + } else { + // Otherwise, set up new blocks for restoring ZA if the Invoke was + // successful. Create a new block to read the value of TPIDR2 (CheckBB), + // which becomes the new normal destination for the instruction. + // Create another block to restore ZA (RestoreBB). ResumeBB is the + // original NormalDest for the Invoke. + auto *InvokeBB = Invoke->getParent(); + auto *ResumeBB = Invoke->getNormalDest(); + auto *CheckBB = BasicBlock::Create(Context, "check.za", F, ResumeBB); + Invoke->setNormalDest(CheckBB); + auto *RestoreBB = BasicBlock::Create(Context, "restore.za", F, ResumeBB); + // Update any successor PHI nodes to match the new blocks. 
+ for (PHINode &PN : ResumeBB->phis()) { + PN.replaceIncomingBlockWith(InvokeBB, CheckBB); + PN.addIncoming(PN.getIncomingValueForBlock(CheckBB), RestoreBB); + } + ResumeBB->replaceSuccessorsPhiUsesWith(InvokeBB, ResumeBB); + // Set-up the lazy save for this Invoke. This also handles restoring the + // lazy save for the NormalDest. + setupLazySave(Builder, Invoke, TPIObj, CheckBB, ResumeBB, RestoreBB); + } + + if (!InvokeAttrs.hasNewZAInterface()) { + // New ZA functions are the only function types which will commit + // a lazy-save. For Invokes to any other type of function, create a + // call which saves ZA to ensure that we restore the correct state + // should the callee throw an unhandled exception that unwinds back + // to the caller. + Builder.SetInsertPoint(Invoke); + emitTPIDR2Save(M, Builder, InvokeAttrs.hasSharedZAInterface()); + } + } + + // We may need to restart streaming-mode depending on the starting value + // of pstate.sm on entry to this function (if streaming-compatible). + // Create a call to get this value in the entry block. + Value *PStateOnEntry = ConstantInt::get(Builder.getInt64Ty(), 1); + if (!Invokes.empty() && FnAttrs.hasStreamingCompatibleInterface()) { + Builder.SetInsertPoint(&*F->getEntryBlock().getFirstInsertionPt()); + PStateOnEntry = emitGetPStateSM(M, Builder); + } + + if (TPIObj) { + // Ensure we stop pstate.za before calling the resume instruction. + // TODO: We can improve this by not restoring the lazy-save & restarting + // pstate.za in the first place before blocks which terminate in a resume. + for (ResumeInst *Resume : Resumes) { + Builder.SetInsertPoint(Resume); + Function *DisableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + } + } + + // Restore state at the beginning of each catch block. + for (CallInst *Catch : BeginCatchCalls) { + Catch->addFnAttr(Attribute::get(Context, "aarch64_pstate_za_preserved")); + + // Restore the lazy-save if there is ZA state. + if (FnAttrs.hasZAState()) + restoreLazySave(Builder, Catch, TPIObj, Catch->getParent(), + /*ResumeBB*/ nullptr, PStateOnEntry); + + // Restart streaming-mode in the catch block if necessary. + if (FnAttrs.requiresSMChange(SMEAttrs())) { + auto *FirstInst = Catch->getParent()->getFirstNonPHIOrDbg(); + // Ensure that any catch blocks which are also landing pads keep the + // LandingPadInst as the first instruction in the block. + if (isa(FirstInst)) + FirstInst = FirstInst->getNextNonDebugInstruction(); + Builder.SetInsertPoint(FirstInst); + Function *EnableSMIntr = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sme_invoke_resume_pstatesm); + Builder.CreateCall(EnableSMIntr->getFunctionType(), EnableSMIntr, + PStateOnEntry); + } + } + + if (TPIObj || !FnAttrs.hasZAState()) { + F->addFnAttr("aarch64_expanded_pstate_za"); + return true; + } + + return false; } /// This function generates code to commit a lazy save at the beginning of a @@ -83,8 +406,8 @@ /// is active and we should call __arm_tpidr2_save to commit the lazy save. /// Additionally, PSTATE.ZA should be enabled at the beginning of the function /// and disabled before returning. 
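
(Reviewer note, illustrative; not part of the patch.) For reference while reading this function and createZABufferAndTPIDR2Block: the 16-byte TPIDR2 block used by the lazy-save scheme has the layout sketched below, and the prelude generated for an __arm_new_za function behaves like the accompanying pseudocode. The field names follow the comments in this pass; the struct itself is a descriptive sketch, not a header from the patch.

    // Layout matching the StructType built in SMEABI::runOnFunction:
    //   { i8*, i16, [6 x i8] }
    struct tpidr2_block {
      void     *za_save_buffer;      // SVL.B x SVL.B byte spill area, 16-byte aligned
      uint16_t  num_za_save_slices;  // live ZA slices recorded before a call
      uint8_t   reserved[6];         // bytes 10-15, cleared to zero
    };

    // Prelude of a new-ZA function (sketch):
    //   if (TPIDR2_EL0 != 0) {      // a caller's lazy save is pending
    //     __arm_tpidr2_save();      // commit it on the caller's behalf
    //     TPIDR2_EL0 = 0;
    //   }
    //   smstart za                  // aarch64.sme.za.enable
    //   zero { zt0 }                // only with SME2, when ZT state is live
    //   ...function body...
    //   smstop za                   // aarch64.sme.za.disable before each return
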
-bool SMEABI::updateNewZAFunctions(Module *M, Function *F, - IRBuilder<> &Builder) { +bool SMEABI::updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder, + bool ClearZTState) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); @@ -104,7 +427,7 @@ // Create a call __arm_tpidr2_save, which commits the lazy save. Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder); + emitTPIDR2Save(M, Builder, /* Clear TPIDR2 */ true); // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); @@ -112,6 +435,14 @@ Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + // If SME2, clear ZT0 on entry to the function, after enabling pstate.za + if (ClearZTState) { + Function *ClearZT0Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, + {Builder.getInt32(0)}); + } + // Before returning, disable pstate.za for (BasicBlock &BB : *F) { Instruction *T = BB.getTerminator(); @@ -132,13 +463,27 @@ LLVMContext &Context = F.getContext(); IRBuilder<> Builder(Context); + TargetPassConfig &TPC = getAnalysis(); + + StructType *TPITy = + StructType::get(Context, {Builder.getInt8PtrTy(), Builder.getInt16Ty(), + ArrayType::get(Builder.getInt8Ty(), 6)}); + if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; bool Changed = false; - SMEAttrs FnAttrs(F); + + const AArch64Subtarget *Subtarget = + TPC.getTM().getSubtargetImpl(F); + + SMEAttrs FnAttrs(F, Subtarget->hasSME2()); if (FnAttrs.hasNewZAInterface()) - Changed |= updateNewZAFunctions(M, &F, Builder); + Changed |= updateNewZAFunctions(M, &F, Builder, + FnAttrs.requiresPreservingZT(SMEAttrs())); + + if (FnAttrs.hasZAState() || FnAttrs.requiresSMChange(SMEAttrs())) + Changed |= handleExceptions(Builder, &F, TPITy); return Changed; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -10,11 +10,13 @@ // //===----------------------------------------------------------------------===// -def imm_to_tile8 : ComplexPattern", []>; -def imm_to_tile16 : ComplexPattern", []>; -def imm_to_tile32 : ComplexPattern", []>; -def imm_to_tile64 : ComplexPattern", []>; -def imm_to_tile128 : ComplexPattern", []>; +def imm_to_tile8 : ComplexPattern", []>; +def imm_to_tile16 : ComplexPattern", []>; +def imm_to_tile32 : ComplexPattern", []>; +def imm_to_tile64 : ComplexPattern", []>; +def imm_to_tile128 : ComplexPattern", []>; + +def imm_to_zt : ComplexPattern", []>; def tileslice8 : ComplexPattern", []>; def tileslice16 : ComplexPattern", []>; @@ -793,6 +795,15 @@ def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO + : Pseudo<(outs), + (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + GPR64sp:$base), []>, + Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayStore = 1; + } // base def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), (!cast(NAME) ZA, $idx, 0, $base, 0)>; @@ -1254,8 +1265,9 @@ def : SVE_1_Op_Passthru_Pat(NAME)>; } +// FIXME: Why is there a tied operand here? There is no predication. 
class sve2_clamp sz, bit U, ZPRRegOp zpr_ty> - : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd), + : I<(outs zpr_ty:$Zd), (ins zpr_ty:$_Zd, zpr_ty:$Zn, zpr_ty:$Zm), asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zm; @@ -2962,6 +2974,18 @@ let Inst{3-0} = opc; } +multiclass sme2_zero_zt opc> { + def NAME : sme2_zero_zt; + + def NAME # _PSEUDO + : Pseudo<(outs), (ins ZTR:$ZT), []>, Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + def : Pat<(int_aarch64_sme_zero_zt (imm_to_zt untyped:$zt)), + (!cast(NAME # _PSEUDO) $zt)>; +} + //===----------------------------------------------------------------------===// // SME2 lookup table load/store class sme2_spill_fill_vector opc> @@ -2981,10 +3005,23 @@ let mayStore = opc{7}; } +multiclass sme2_spill_fill_vector opc, SDPatternOperator op> { + def NAME : sme2_spill_fill_vector; + + def NAME # _PSEUDO + : Pseudo<(outs), (ins MatrixOp:$ZTt, GPR64sp:$base), []>, Sched<[]> { + // Translated to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + + def : Pat<(op (imm_to_zt untyped:$tile), GPR64sp:$base), + (!cast(NAME # _PSEUDO) $tile, $base)>; +} + //===----------------------------------------------------------------------===/// // SME2 move to/from lookup table class sme2_movt_zt_to_scalar opc> - : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, uimm3s8:$imm3), + : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, tuimm3s8_32:$imm3), mnemonic, "\t$Rt, $ZTt$imm3", "", []>, Sched<[]> { bits<3> imm3; @@ -2996,7 +3033,7 @@ } class sme2_movt_scalar_to_zt opc> - : I<(outs ZTR:$ZTt), (ins uimm3s8:$imm3, GPR64:$Rt), + : I<(outs ZTR:$ZTt), (ins tuimm3s8_32:$imm3, GPR64:$Rt), mnemonic, "\t$ZTt$imm3, $Rt", "", []>, Sched<[]> { bits<3> imm3; @@ -3027,28 +3064,46 @@ class sme2_luti2_vector_index sz, RegisterOperand vector_ty, string mnemonic> - : sme2_luti_vector_index { + : sme2_luti_vector_index { bits<4> i; let Inst{17-14} = i; } -multiclass sme2_luti2_vector_index { +multiclass sme2_luti2_vector_index { def _B : sme2_luti2_vector_index<0b00, ZPR8, mnemonic>; def _H : sme2_luti2_vector_index<0b01, ZPR16, mnemonic>; def _S : sme2_luti2_vector_index<0b10, ZPR32, mnemonic>; + + def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _B) $zt, nxv16i8:$zn, (i32 VectorIndexB32b_timm:$imm))>; + + def : Pat<(nxv8i16 (intrinsic (imm_to_zt untyped:$zt), nxv8i16:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _H) $zt, nxv8i16:$zn, (i32 VectorIndexB32b_timm:$imm))>; + + def : Pat<(nxv4i32 (intrinsic (imm_to_zt untyped:$zt), nxv4i32:$zn, (i32 VectorIndexB32b_timm:$imm))), + (!cast(NAME # _S) $zt, nxv4i32:$zn, (i32 VectorIndexB32b_timm:$imm))>; } class sme2_luti4_vector_index sz, RegisterOperand vector_ty, string mnemonic> - : sme2_luti_vector_index { + : sme2_luti_vector_index { bits<3> i; let Inst{16-14} = i; } -multiclass sme2_luti4_vector_index { +multiclass sme2_luti4_vector_index { def _B : sme2_luti4_vector_index<0b00, ZPR8, mnemonic>; def _H : sme2_luti4_vector_index<0b01, ZPR16, mnemonic>; def _S : sme2_luti4_vector_index<0b10, ZPR32, mnemonic>; + + def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _B) $zt, nxv16i8:$zn, (i32 VectorIndexH32b_timm:$imm))>; + + def : Pat<(nxv8i16 (intrinsic (imm_to_zt untyped:$zt), nxv8i16:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _H) $zt, nxv8i16:$zn, (i32 VectorIndexH32b_timm:$imm))>; + + def : 
Pat<(nxv4i32 (intrinsic (imm_to_zt untyped:$zt), nxv4i32:$zn, (i32 VectorIndexH32b_timm:$imm))), + (!cast(NAME # _S) $zt, nxv4i32:$zn, (i32 VectorIndexH32b_timm:$imm))>; } // SME2 lookup table expand two contiguous registers @@ -3081,6 +3136,9 @@ def _B : sme2_luti2_vector_vg2_index<0b00, ZZ_b_mul_r, mnemonic>; def _H : sme2_luti2_vector_vg2_index<0b01, ZZ_h_mul_r, mnemonic>; def _S : sme2_luti2_vector_vg2_index<0b10, ZZ_s_mul_r, mnemonic>; + + def _B_PSEUDO : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexH:$i), []>; + def _H_PSEUDO : Pseudo<(outs ZZ_h_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexH:$i), []>; } class sme2_luti4_vector_vg2_index sz, RegisterOperand vector_ty, @@ -3094,6 +3152,9 @@ def _B : sme2_luti4_vector_vg2_index<0b00, ZZ_b_mul_r, mnemonic>; def _H : sme2_luti4_vector_vg2_index<0b01, ZZ_h_mul_r, mnemonic>; def _S : sme2_luti4_vector_vg2_index<0b10, ZZ_s_mul_r, mnemonic>; + + def _B_PSEUDO : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexH:$i), []>; + def _H_PSEUDO : Pseudo<(outs ZZ_h_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexH:$i), []>; } // SME2 lookup table expand four contiguous registers @@ -3126,6 +3187,9 @@ def _B : sme2_luti2_vector_vg4_index<0b00, ZZZZ_b_mul_r, mnemonic>; def _H : sme2_luti2_vector_vg4_index<0b01, ZZZZ_h_mul_r, mnemonic>; def _S : sme2_luti2_vector_vg4_index<0b10, ZZZZ_s_mul_r, mnemonic>; + + def _B_PSEUDO : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexS:$i), []>; + def _H_PSEUDO : Pseudo<(outs ZZZZ_h_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexS:$i), []>; } class sme2_luti4_vector_vg4_index sz, RegisterOperand vector_ty, @@ -3138,6 +3202,9 @@ multiclass sme2_luti4_vector_vg4_index { def _H : sme2_luti4_vector_vg4_index<0b01, ZZZZ_h_mul_r, mnemonic>; def _S : sme2_luti4_vector_vg4_index<0b10, ZZZZ_s_mul_r, mnemonic>; + + def _H_PSEUDO : Pseudo<(outs ZZZZ_h_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexS:$i), []>; + def _S_PSEUDO : Pseudo<(outs ZZZZ_s_strided_and_contiguous:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn, VectorIndexS:$i), []>; } //===----------------------------------------------------------------------===// @@ -4660,4 +4727,5 @@ multiclass sme2p1_luti4_vector_vg4_index { def _H: sme2p1_luti4_vector_vg4_index<0b01, ZZZZ_h_strided, VectorIndexD, mnemonic>; + def _S: sme2p1_luti4_vector_vg4_index<0b10, ZZZZ_s_strided, VectorIndexD, mnemonic>; } diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -0,0 +1,182 @@ +//===- SMEPeepholeOpt.cpp - SME peephole optimization pass ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64SMEAttributes.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-sme-peephole-opt" + +namespace { + +struct SMEPeepholeOpt : public MachineFunctionPass { + static char ID; + + SMEPeepholeOpt() : MachineFunctionPass(ID) { + initializeSMEPeepholeOptPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SME Peephole Optimization pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool optimizeStartStopPairs(MachineBasicBlock &MBB) const; +}; + +char SMEPeepholeOpt::ID = 0; + +} // end anonymous namespace + +static bool isConditionalStartStop(const MachineInstr *MI) { + return MI->getOpcode() == AArch64::MSRpstatePseudo; +} + +static bool isMatchingStartStopPair(const MachineInstr *MI1, + const MachineInstr *MI2) { + // We only consider the same type of streaming mode change here, i.e. + // start/stop SM, or start/stop ZA pairs. + if (MI1->getOperand(0).getImm() != MI2->getOperand(0).getImm()) + return false; + + bool IsConditional = isConditionalStartStop(MI2); + if (isConditionalStartStop(MI1) != IsConditional) + return false; + + if (!IsConditional) + return true; + + // This optimisation is unlikely to happen in practice for conditional + // smstart/smstop pairs as the virtual registers for pstate.sm will always + // be different. + // TODO: For this optimisation to apply to conditional smstart/smstop, + // this pass will need to do more work to remove redundant calls to + // __arm_sme_state. + + // Only consider conditional start/stop pairs which read the same register + // holding the original value of pstate.sm, as some conditional start/stops + // require the state on entry to the function. + Register Reg1 = MI1->getOperand(2).getReg(); + Register Reg2 = MI2->getOperand(2).getReg(); + if (Reg1.isPhysical() || Reg2.isPhysical() || Reg1 != Reg2) + return false; + + // Ensure reg masks are identical. + if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask()) + return false; + + // Check to make sure the conditional start/stop pairs are identical. + return MI1->getOperand(3).getImm() == MI2->getOperand(3).getImm(); +} + +bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB) const { + SmallVector ToBeRemoved; + MachineInstr *Prev = nullptr; + bool PrevIsStart = false; + + for (MachineInstr &MI : make_early_inc_range(MBB)) { + // Walk through instructions in the block trying to find pairs of smstart + // and smstop nodes that cancel each other out. We only permit a limited + // set of instructions to appear between them, otherwise we reset our + // tracking. + switch (MI.getOpcode()) { + default: + Prev = nullptr; + break; + case AArch64::BL: { + // Permits calls to __arm_sme_state. + if (!MI.getOperand(0).isSymbol() || + strcmp(MI.getOperand(0).getSymbolName(), "__arm_sme_state")) + Prev = nullptr; + break; + } + case AArch64::COPY: { + // Permit copies of 32 and 64-bit registers. 
+ if (!MI.getOperand(1).isReg()) { + Prev = nullptr; + break; + } + Register Reg = MI.getOperand(1).getReg(); + if (!AArch64::GPR32RegClass.contains(Reg) && + !AArch64::GPR64RegClass.contains(Reg)) + Prev = nullptr; + break; + } + case AArch64::ADJCALLSTACKDOWN: + case AArch64::ADJCALLSTACKUP: + case AArch64::ANDXri: + // We permit these as they don't generate SVE/NEON instructions. + break; + case AArch64::MSRpstatesvcrImm1: + case AArch64::MSRpstatePseudo: { + // Pairs of smstart/smstop nodes must either both be unconditional or + // both be conditional. + if (Prev && !isMatchingStartStopPair(Prev, &MI)) { + Prev = nullptr; + break; + } + + assert((MI.getOperand(1).getImm() < 2) && "Invalid SM state"); + bool CurIsStart = (MI.getOperand(1).getImm() != 0); + if (Prev && CurIsStart != PrevIsStart) { + ToBeRemoved.push_back(Prev); + ToBeRemoved.push_back(&MI); + } + + if (Prev) + Prev = nullptr; + else { + Prev = &MI; + PrevIsStart = CurIsStart; + } + break; + } + } + } + + for (MachineInstr *MI : ToBeRemoved) + MI->eraseFromParent(); + + return ToBeRemoved.size(); +} + +INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt", + "SME Peephole Optimization", false, false) + +bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + if (!MF.getSubtarget().hasSME()) + return false; + + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + bool Changed = false; + + // Even if the block lives in a function with no SME attributes attached we + // still have to analyze all the blocks because we may call a streaming + // function that requires smstart/smstop pairs. + for (MachineBasicBlock &MBB : MF) { + Changed |= optimizeStartStopPairs(MBB); + } + + return Changed; +} + +FunctionPass *llvm::createSMEPeepholeOptPass() { return new SMEPeepholeOpt(); } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2118,6 +2118,31 @@ let mayRaiseFPException = 1; } + +multiclass sve2p1_bf_2op_p_zds opc, string asm, string Ps, + SDPatternOperator op, DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { +let DestructiveInstType = flags in { + def NAME : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_3_Op_Pat(NAME)>; +} + +multiclass sve2p1_bf_bin_pred_zds { + def _UNDEF : PredTwoOpPseudo; + + def : SVE_3_Op_Pat(NAME # _UNDEF)>; +} + +multiclass sve2p1_bf_2op_p_zds_zeroing { + def _ZERO : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO)>; +} + + multiclass sve_fp_2op_p_zds opc, string asm, string Ps, SDPatternOperator op, DestructiveInstTypeEnum flags, string revname="", bit isReverseInstr=0> { @@ -2266,6 +2291,14 @@ def : SVE_2_Op_Pred_All_Active(NAME # _D)>; } +multiclass sve2p1_bf_3op_u_zd opc1, string asm, SDPatternOperator op, + SDPatternOperator predicated_op = null_frag> { + def NAME : sve_fp_3op_u_zd<0b00, opc1, asm, ZPR16>; + def : SVE_2_Op_Pat(NAME)>; + + def : SVE_2_Op_Pred_All_Active(NAME)>; +} + multiclass sve_fp_3op_u_zd_ftsmul opc, string asm, SDPatternOperator op> { def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; @@ -2324,6 +2357,14 @@ def : SVE_4_Op_Pat(NAME # _D)>; } +multiclass sve_fp_3op_p_zds_a_bf opc, string asm, string Ps, + SDPatternOperator op> { + def NAME : sve_fp_3op_p_zds_a<0b00, opc, asm, ZPR16>, + SVEPseudo2Instr, 
SVEInstr2Rev; + + def : SVE_4_Op_Pat(NAME)>; +} + class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za), @@ -2391,15 +2432,17 @@ let mayRaiseFPException = 1; } -multiclass sve2p1_fp_bfma_by_indexed_elem opc> { +multiclass sve2p1_fp_bfma_by_indexed_elem opc, SDPatternOperator op> { def NAME : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, - VectorIndexH32b> { + VectorIndexH32b_timm> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } + def : Pat<(nxv8bf16 (op nxv8bf16:$op1, nxv8bf16:$op2, nxv8bf16:$op3, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME) $op1, $op2, $op3, VectorIndexH32b_timm:$idx)>; } multiclass sve_fp_fma_by_indexed_elem opc, string asm, @@ -2456,7 +2499,7 @@ let mayRaiseFPException = 1; } -multiclass sve2p1_fp_bfmul_by_indexed_elem { +multiclass sve2p1_fp_bfmul_by_indexed_elem { def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; @@ -2464,6 +2507,9 @@ let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } + + def : Pat <(nxv8bf16 (ir_intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>; } multiclass sve_fp_fmul_by_indexed_elem { @@ -8797,6 +8843,47 @@ def : SVE_3_Op_Pat(NAME)>; } +class sve_bfloat_matmul_longvecl +: sve_bfloat_matmul { + let Inst{23} = 0b1; + let Inst{14} = 0b0; + let Inst{13} = sub; + let Inst{10} = BT; +} + +multiclass sve_bfloat_matmul_longvecl { + def NAME : sve_bfloat_matmul_longvecl; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve_bfloat_matmul_longvecl_idx +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH32b:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<3> iop; + let Inst{31-21} = 0b01100100111; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{15-14} = 0b01; + let Inst{13} = sub; + let Inst{12} = 0b0; + let Inst{11} = iop{0}; + let Inst{10} = BT; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeH; +} + +multiclass sve_bfloat_matmul_longvecl_idx { + def NAME : sve_bfloat_matmul_longvecl_idx; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + class sve_bfloat_convert : I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> { @@ -9093,6 +9180,12 @@ def : SVE_4_Op_Pat(NAME # _D_UNDEF)>; } +multiclass sve_fp_3op_pred_bf { + def _UNDEF : PredThreeOpPseudo; + + def : SVE_4_Op_Pat(NAME # _UNDEF)>; +} + // Predicated pseudo integer two operand instructions. 
multiclass sve_int_bin_pred_bhsd { def _B_UNDEF : PredTwoOpPseudo; @@ -9178,6 +9271,11 @@ def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve2p1_bfclamp { + def NAME : sve2p1_fclamp; + def : SVE_3_Op_Pat(NAME)>; +} + // SVE two-way dot product class sve2p1_two_way_dot_vv : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), @@ -9739,11 +9837,30 @@ } -multiclass sve_mem_128b_gld_64_unscaled { +multiclass sve_mem_128b_gld_64_unscaled { def NAME : sve_mem_128b_gld_64_unscaled; def : InstAlias(NAME) Z_q:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + + def : Pat<(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2i64)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4i32)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv8i16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8i16)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv16i8 (op (nxv16i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv16i8)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + + def : Pat<(nxv2f64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2f64)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv4f32 (op (nxv4i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4f32)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv8f16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8f16)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(nxv8bf16 (op (nxv8i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8bf16)), + (!cast(NAME) PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_128b_64_unscaled @@ -9766,11 +9883,29 @@ } -multiclass sve_mem_sst_128b_64_unscaled { +multiclass sve_mem_sst_128b_64_unscaled { def NAME : sve_mem_sst_128b_64_unscaled; def : InstAlias(NAME) Z_q:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat<(op (nxv2i64 Z_q:$Zt), (nxv2i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2i64), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv4i32 Z_q:$Zt), (nxv4i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4i32), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv8i16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8i16), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp,ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv16i8 Z_q:$Zt), (nxv16i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv16i8), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + + def : Pat<(op (nxv2f64 Z_q:$Zt), (nxv2i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv2f64), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv4f32 Z_q:$Zt), (nxv4i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv4f32), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv8f16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8f16), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; + def : Pat<(op (nxv8bf16 Z_q:$Zt), (nxv8i1 PPR3bAny:$gp), (nxv2i64 ZPR64:$Zn), (i64 GPR64sp:$Rm), nxv8bf16), + (!cast(NAME) Z_q:$Zt, PPR3bAny:$gp, ZPR64:$Zn, GPR64:$Rm)>; } @@ -9860,12 +9995,15 @@ let mayRaiseFPException = 1; } -multiclass sve2p1_fp_reduction_q opc, string mnemonic> { +multiclass sve2p1_fp_reduction_q opc, 
string mnemonic, SDPatternOperator op> { def _H : sve2p1_fp_reduction_q<0b01, opc, mnemonic, ZPR16, "8h">; def _S : sve2p1_fp_reduction_q<0b10, opc, mnemonic, ZPR32, "4s">; def _D : sve2p1_fp_reduction_q<0b11, opc, mnemonic, ZPR64, "2d">; -} + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} // SVE Permute Vector - Quadwords (DUPQ) class sve2p1_dupq ind_tsz, string mnemonic, ZPRRegOp zprty, Operand itype> @@ -9905,7 +10043,7 @@ // SVE Permute Vector - Quadwords (EXTQ) class sve2p1_extq - : I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_15:$imm4), + : I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, timm32_0_15:$imm4), mnemonic, "\t$Zdn, $_Zdn, $Zm, $imm4", "", []>, Sched<[]> { bits<5> Zdn; @@ -9923,6 +10061,19 @@ let hasSideEffects = 0; } +multiclass sve2p1_extq { + // svuint8_t svextq_lane[_u8](svuint8_t zdn, svuint8_t zm, uint64_t imm); + def NAME : sve2p1_extq; + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; + + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; + def : SVE_3_Op_Imm_Pat(NAME)>; +} // SVE move predicate from vector class sve2p1_vector_to_pred opc, string mnemonic, @@ -9944,8 +10095,8 @@ let hasSideEffects = 0; } -multiclass sve2p1_vector_to_pred { - def _B : sve2p1_vector_to_pred<{0, 0, 0, 1}, mnemonic, PPR8, VectorIndex0>; +multiclass sve2p1_vector_to_pred { + def _B : sve2p1_vector_to_pred<{0, 0, 0, 1}, mnemonic, PPR8, VectorIndex032b>; def _H : sve2p1_vector_to_pred<{0, 0, 1, ?}, mnemonic, PPR16, VectorIndexD32b> { bits<1> index; let Inst{17} = index; @@ -9962,6 +10113,18 @@ def : InstAlias(NAME # _B) PPR8:$Pd, ZPRAny:$Zn, 0), 1>; + + def : Pat<(nxv16i1 (Op (nxv16i8 ZPRAny:$Zn), (i32 timm32_0_0:$Idx))), + (!cast(NAME # _B) ZPRAny:$Zn, timm32_0_0:$Idx)>; + + def : Pat<(nxv8i1 (Op (nxv8i16 ZPRAny:$Zn), (i32 timm32_0_1:$Idx))), + (!cast(NAME # _H) ZPRAny:$Zn, timm32_0_1:$Idx)>; + + def : Pat<(nxv4i1 (Op (nxv4i32 ZPRAny:$Zn), (i32 timm32_0_3:$Idx))), + (!cast(NAME # _S) ZPRAny:$Zn, timm32_0_3:$Idx)>; + + def : Pat<(nxv2i1 (Op (nxv2i64 ZPRAny:$Zn), (i32 timm32_0_7:$Idx))), + (!cast(NAME # _D) ZPRAny:$Zn, timm32_0_7:$Idx)>; } @@ -9985,8 +10148,9 @@ let hasSideEffects = 0; } -multiclass sve2p1_pred_to_vector { - def _B : sve2p1_pred_to_vector<{0, 0, 0, 1}, mnemonic, PPR8, VectorIndex0>; +multiclass sve2p1_pred_to_vector { + def _B : sve2p1_pred_to_vector<{0, 0, 0, 1}, mnemonic, PPR8, VectorIndex032b>; def _H : sve2p1_pred_to_vector<{0, 0, 1, ?}, mnemonic, PPR16, VectorIndexD32b> { bits<1> index; let Inst{17} = index; @@ -10003,6 +10167,24 @@ def : InstAlias(NAME # _B) ZPRAny:$Zd, 0, PPR8:$Pn), 1>; + + // Merge + def : Pat<(nxv8i16 (MergeOp (nxv8i16 ZPRAny:$Zd), (nxv8i1 PPR16:$Pn), (i32 timm32_1_1:$Idx))), + (!cast(NAME # _H) ZPRAny:$Zd, timm32_1_1:$Idx, PPR16:$Pn)>; + def : Pat<(nxv4i32 (MergeOp (nxv4i32 ZPRAny:$Zd), (nxv4i1 PPR32:$Pn), (i32 timm32_1_3:$Idx))), + (!cast(NAME # _S) ZPRAny:$Zd, timm32_1_3:$Idx, PPR32:$Pn)>; + def : Pat<(nxv2i64 (MergeOp (nxv2i64 ZPRAny:$Zd), (nxv2i1 PPR64:$Pn), (i32 timm32_1_7:$Idx))), + (!cast(NAME # _D) ZPRAny:$Zd, timm32_1_7:$Idx, PPR64:$Pn)>; + + // Zero + def : Pat<(nxv16i8 (ZeroOp (nxv16i1 PPR8:$Pn))), + (!cast(NAME # _B) (IMPLICIT_DEF), 0, PPR8:$Pn)>; + def : Pat<(nxv8i16 (ZeroOp (nxv8i1 PPR16:$Pn))), + (!cast(NAME # _H) (IMPLICIT_DEF), 0, PPR16:$Pn)>; + def : Pat<(nxv4i32 (ZeroOp (nxv4i1 PPR32:$Pn))), + (!cast(NAME # _S) (IMPLICIT_DEF), 0, PPR32:$Pn)>; 
+ def : Pat<(nxv2i64 (ZeroOp (nxv2i1 PPR64:$Pn))), + (!cast(NAME # _D) (IMPLICIT_DEF), 0, PPR64:$Pn)>; } @@ -10029,11 +10211,16 @@ let hasSideEffects = 0; } -multiclass sve2p1_int_reduce_q opc, string mnemonic> { +multiclass sve2p1_int_reduce_q opc, string mnemonic, SDPatternOperator op> { def _B : sve2p1_int_reduce_q<0b00, opc, mnemonic, ZPR8, "16b">; def _H : sve2p1_int_reduce_q<0b01, opc, mnemonic, ZPR16, "8h">; def _S : sve2p1_int_reduce_q<0b10, opc, mnemonic, ZPR32, "4s">; def _D : sve2p1_int_reduce_q<0b11, opc, mnemonic, ZPR64, "2d">; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } @@ -10058,16 +10245,39 @@ let hasSideEffects = 0; } -multiclass sve2p1_permute_vec_elems_q opc, string mnemonic> { +multiclass sve2p1_permute_vec_elems_q opc, string mnemonic, SDPatternOperator op> { def _B : sve2p1_permute_vec_elems_q<0b00, opc, mnemonic, ZPR8, ZPR8>; def _H : sve2p1_permute_vec_elems_q<0b01, opc, mnemonic, ZPR16, ZPR16>; def _S : sve2p1_permute_vec_elems_q<0b10, opc, mnemonic, ZPR32, ZPR32>; def _D : sve2p1_permute_vec_elems_q<0b11, opc, mnemonic, ZPR64, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; } -multiclass sve2p1_tblq { +multiclass sve2p1_tblq { def _B : sve2p1_permute_vec_elems_q<0b00, 0b110, mnemonic, ZPR8, Z_b>; def _H : sve2p1_permute_vec_elems_q<0b01, 0b110, mnemonic, ZPR16, Z_h>; def _S : sve2p1_permute_vec_elems_q<0b10, 0b110, mnemonic, ZPR32, Z_s>; def _D : sve2p1_permute_vec_elems_q<0b11, 0b110, mnemonic, ZPR64, Z_d>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; } + diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -24,6 +24,8 @@ /// a call). class SMEAttrs { unsigned Bitmask; + bool HasSME2; + bool NoReturn; public: // Enum with bitmasks for each individual SME feature. @@ -38,13 +40,23 @@ All = ZA_Preserved - 1 }; - SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } - SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {} + SMEAttrs(unsigned Mask = Normal, bool SME2 = false) + : Bitmask(0), HasSME2(false), NoReturn(false) { + set(Mask); + HasSME2 = SME2; + NoReturn = false; + } + SMEAttrs(const Function &F, bool SME2 = false) : SMEAttrs(F.getAttributes()) { + HasSME2 = SME2; + NoReturn = F.doesNotReturn(); + } SMEAttrs(const CallBase &CB); SMEAttrs(const AttributeList &L); void set(unsigned M, bool Enable = true); + bool hasNoReturn() const { return NoReturn; } + // Interfaces to query PSTATE.SM bool hasStreamingBody() const { return Bitmask & SM_Body; } bool hasStreamingInterface() const { return Bitmask & SM_Enabled; } @@ -72,6 +84,15 @@ requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface = false) const; + /// \return true if a call from Caller -> Callee requires ZT0 state to be + /// preserved. 
+ /// ZT0 must be preserved if the caller has ZT state (SME2 is enabled and + /// the caller has ZA state) and the callee does not preserve ZT (SME2 is + /// enabled and the callee does not preserve ZA). + bool requiresPreservingZT(const SMEAttrs &Callee) const { + return HasSME2 && hasZTState() && !Callee.preservesZA(); + } + // Interfaces to query PSTATE.ZA bool hasNewZAInterface() const { return Bitmask & ZA_New; } bool hasSharedZAInterface() const { return Bitmask & ZA_Shared; } @@ -82,8 +103,10 @@ } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && - !Callee.preservesZA(); + !Callee.preservesZA() && !Callee.hasNoReturn(); } + + bool hasZTState() const { return hasZAState(); } }; } // namespace llvm diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -28,12 +28,16 @@ SMEAttrs::SMEAttrs(const CallBase &CB) { *this = SMEAttrs(CB.getAttributes()); - if (auto *F = CB.getCalledFunction()) - set(SMEAttrs(*F).Bitmask); + if (auto *F = CB.getCalledFunction()) { + SMEAttrs A(*F); + set(A.Bitmask); + NoReturn |= A.NoReturn; + } } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask = 0; + NoReturn = false; if (Attrs.hasFnAttr("aarch64_pstate_sm_enabled")) Bitmask |= SM_Enabled; if (Attrs.hasFnAttr("aarch64_pstate_sm_compatible")) @@ -46,6 +50,8 @@ Bitmask |= ZA_New; if (Attrs.hasFnAttr("aarch64_pstate_za_preserved")) Bitmask |= ZA_Preserved; + if (Attrs.hasFnAttr(Attribute::NoReturn)) + NoReturn = true; } std::optional diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -117,6 +117,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: SME Peephole Optimization pass ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -17,15 +17,15 @@ ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: bl streaming_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload 
@@ -43,15 +43,15 @@ ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: bl streaming_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -68,92 +68,68 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" { -; CHECK-FISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-FISEL: // %bb.0: // %entry -; CHECK-FISEL-NEXT: sub sp, sp, #96 -; CHECK-FISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstop sm -; CHECK-FISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: bl normal_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: adrp x8, .LCPI1_0 -; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: fadd d0, d1, d0 -; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: add sp, sp, #96 -; CHECK-FISEL-NEXT: ret -; -; CHECK-GISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: sub sp, sp, #96 -; CHECK-GISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: bl normal_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 -; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: fadd d0, d1, d0 
-; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: add sp, sp, #96 -; CHECK-GISEL-NEXT: ret -entry: - %call = call double @normal_callee(double %x) - %add = fadd double %call, 4.200000e+01 - ret double %add -} - -define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { -; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: -; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee: +; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: sub sp, sp, #96 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm -; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 -; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: fadd d0, d1, d0 -; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret +entry: + %call = call double @normal_callee(double %x) + %add = fadd double %call, 4.200000e+01 + ret double %add +} + +define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { +; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl 
normal_callee +; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 +; CHECK-COMMON-NEXT: fadd d0, d1, d0 +; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #112 ; CHECK-COMMON-NEXT: ret %call = call double @normal_callee(double %x); %add = fadd double %call, 4.200000e+01 @@ -236,6 +212,8 @@ ; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -246,7 +224,7 @@ ; CHECK-COMMON-NEXT: .LBB6_2: // %entry ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: bl za_shared_callee -; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstop za @@ -266,11 +244,12 @@ ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: subs x9, x9, x8 +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 @@ -285,7 +264,7 @@ ; CHECK-COMMON-NEXT: b .LBB7_2 ; CHECK-COMMON-NEXT: .LBB7_2: // %entry ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr -; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: mov sp, x29 @@ -304,14 +283,15 @@ ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x9, x8 -; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za @@ -340,9 +320,9 @@ ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm @@ -357,3 +337,103 @@ %res = fadd fp128 %a, %b ret fp128 %res } + +; As above this should use Selection DAG to make sure the libcall call is lowered correctly. +define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nounwind { +; CHECK-COMMON-LABEL: frem_call_za: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl fmod +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: cbnz x8, .LBB10_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB10_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret + %res = frem double %a, %b + ret double %res +} + +; As above this should use Selection DAG to make sure the libcall is lowered correctly. 
+define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} + +; As above this should use Selection DAG to make sure the libcall is lowered correctly. +define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm_compat: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: ldr s2, [sp, #8] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: .LBB12_2: +; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: tbz x19, #0, .LBB12_4 +; CHECK-COMMON-NEXT: // %bb.3: +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: .LBB12_4: +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} diff --git a/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes-origin-bugreport.ll b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes-origin-bugreport.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes-origin-bugreport.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s | FileCheck %s + +; This test is a bit brittle, but it's very hard to construct a test where this +; happens, and perhaps better than no test at all. +; +; We want to avoid the following sequence from happening where the register +; allocator rematerialises the cntd within the call sequence. +; smstart sm +; bl foo +; cntd x8 +; smstop sm + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @avoid_remat_in_call_sequence(ptr %n, ptr %idx, i32 %0, ptr %list, i64 %idxprom35, ptr %1, ptr %2, ptr %vla25, %3, ptr %vla26) #0 { +; CHECK: cntd +; CHECK: smstart sm +; CHECK-NOT: cntd +; CHECK: bl foo +; CHECK-NOT: cntd +; CHECK: smstop sm +entry: + br label %for.body + +for.body: ; preds = %for.body31, %entry + %vla253 = alloca double, i64 0, align 8 + call void @foo() + %.tr = call i32 @llvm.vscale.i32() + %4 = shl i32 %.tr, 1 + br label %for.body31 + +for.body31: ; preds = %for.body31, %for.body + %5 = call @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 0, i32 %0) + %idx2 = getelementptr [12000000 x i32], ptr %list, i64 0, i64 %idxprom35 + %6 = call @llvm.masked.load.nxv2i32.p0(ptr %idx2, i32 1, %5, zeroinitializer) + %7 = sext %6 to + %8 = call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( zeroinitializer, ptr %1, zeroinitializer) + %9 = call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( zeroinitializer, ptr %2, %7) + %10 = call @llvm.masked.load.nxv2f64.p0(ptr %idx, i32 1, %3, zeroinitializer) + %11 = call @llvm.masked.load.nxv2f64.p0(ptr %vla26, i32 1, %3, zeroinitializer) + %12 = call @llvm.masked.load.nxv2f64.p0(ptr %n, i32 1, %3, zeroinitializer) + call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64( zeroinitializer, zeroinitializer, ptr %vla25, zeroinitializer) + %cmp29 = icmp slt i32 %4, 0 + br i1 %cmp29, label %for.body31, label %for.body +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32, i32) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, ptr, ) #2 + +declare void @foo() #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(, , ptr, ) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.vscale.i32() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.masked.load.nxv2i32.p0(ptr nocapture, i32 immarg, , ) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.masked.load.nxv2f64.p0(ptr nocapture, i32 immarg, , ) #2 + +; uselistorder directives +uselistorder ptr @llvm.aarch64.sve.ld1.gather.index.nxv2f64, { 1, 0 } +uselistorder ptr @llvm.masked.load.nxv2f64.p0, { 2, 1, 0 } + +attributes #0 = { "target-features"="+bf16,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+outline-atomics,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { "aarch64_pstate_sm_enabled" } +attributes #4 = { nocallback nofree nosync nounwind willreturn 
memory(argmem: write) } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + + +define void @dont_rematerialize_cntd(i32 %N) #0 { +; CHECK-LABEL: dont_rematerialize_cntd: +; CHECK: cntd +; CHECK: smstop sm +; CHECK-NOT: cntd +; CHECK: bl foo +; CHECK: smstart sm +entry: + %cmp2 = icmp sgt i32 %N, 0 + br i1 %cmp2, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind + %.tr = call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 4 + call void @foo(i32 %conv) + %inc = add nuw nsw i32 %index.03, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void +} + +; This test doesn't strictly make sense, because it passes a scalable predicate +; to a function, which makes little sense if the VL is not the same in/out of +; streaming-SVE mode. If the VL is known to be the same, then we could just as +; well rematerialize the `ptrue` inside the call sequence. However, the purpose +; of this test is more to ensure that the logic works, which may also trigger +; when the value is not being passed as argument (e.g. when it is hoisted from +; a loop and placed inside the call sequence). This happened in the original +; bug report we got for this, for which I've also added a test, but that test +; is more brittle. +; +; FIXME: This test also exposes another bug, where the 'mul vl' addressing mode +; is used before/after the smstop. This will be fixed in a separate patch. 
+define void @dont_rematerialize_ptrue(i32 %N) #0 { +; CHECK-LABEL: dont_rematerialize_ptrue: +; CHECK: ptrue [[PTRUE:p[0-9]+]].b +; CHECK: str [[PTRUE]], [[[SPILL_ADDR:.*]]] +; CHECK: smstop sm +; CHECK: ldr p0, [[[SPILL_ADDR]]] +; CHECK-NOT: ptrue +; CHECK: bl bar +; CHECK: smstart sm +entry: + %cmp2 = icmp sgt i32 %N, 0 + br i1 %cmp2, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind + %ptrue.ins = insertelement poison, i1 1, i32 0 + %ptrue = shufflevector %ptrue.ins, poison, zeroinitializer + call void @bar( %ptrue) + %inc = add nuw nsw i32 %index.03, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void +} +declare void @foo(i32) +declare void @bar() +declare i32 @llvm.vscale.i32() + +attributes #0 = { "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+sme,+sve" } diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode-llc.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -o - %S/sme-exceptions-with-streaming-mode.ll | FileCheck %s + +; CHECK-LABEL: no_za_streaming_enabled: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB0_2: // %return +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: // %lpad +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: b .LBB0_2 +; +; CHECK-LABEL: no_za_streaming_compatible: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz x20, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: tbz x20, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB1_6: // %return +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_7: // %lpad +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz x19, #0, .LBB1_9 +; CHECK-NEXT: // %bb.8: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_9: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_11 +; CHECK-NEXT: // %bb.10: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_11: // %lpad +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_13 +; CHECK-NEXT: // %bb.12: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_13: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_15 +; CHECK-NEXT: // %bb.14: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_15: // %lpad +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_17 +; CHECK-NEXT: // %bb.16: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_17: // %lpad +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: b .LBB1_6 diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-streaming-mode.ll @@ -0,0 +1,90 @@ +; RUN: opt -S -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s + +declare i32 @normal_callee() + +; Simple try-catch with no ZA state. No lazy-save is required, but we must restart pstate.sm in the +; exception handler. 
+ +define i32 @no_za_streaming_enabled() "aarch64_pstate_sm_enabled" personality i32 1 { +; CHECK-LABEL: define {{[^@]+}}@no_za_streaming_enabled() #1 personality i32 1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; As above, but the function is streaming_compatible + +define i32 @no_za_streaming_compatible() "aarch64_pstate_sm_compatible" personality i32 1 { +; CHECK-LABEL: define {{[^@]+}}@no_za_streaming_compatible() #2 personality i32 1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call aarch64_sme_preservemost_from_x2 [[TMP0]] @__arm_sme_state() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [[TMP0]] [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +declare ptr @__cxa_begin_catch(ptr) +declare void @__cxa_end_catch() + +; CHECK: declare %0 @__arm_sme_state() #4 + +; CHECK: attributes #0 = { "target-features"="+sme" } +; CHECK: attributes #1 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" } +; CHECK: attributes #2 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "target-features"="+sme" } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #4 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } +; CHECK: attributes #5 = { "aarch64_pstate_za_preserved" } diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state-llc.ll @@ -0,0 
+1,1380 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs -o - %S/sme-exceptions-with-za-state.ll | FileCheck %s + +; CHECK-LABEL: private_za_invoke: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-112] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-100] +; CHECK-NEXT: sturh wzr, [x29, #-102] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #96 +; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur x10, [x29, #-70] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: // %bb.1: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB0_3 +; CHECK-NEXT: // %bb.2: // %restore.za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_3: // %return +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB0_4: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_5: // %lpad +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #112 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sturh w9, [x29, #-104] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #112 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB0_7 +; CHECK-NEXT: // %bb.6: // %lpad +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_7: // %lpad +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB0_4 +; +; CHECK-LABEL: new_za_invoke: +; CHECK: .Lfunc_begin1: +; 
CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: stur wzr, [x29, #-100] +; CHECK-NEXT: sturh wzr, [x29, #-102] +; CHECK-NEXT: stur x10, [x29, #-112] +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #96 +; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur x10, [x29, #-70] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz x20, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: bl snap_new +; CHECK-NEXT: tbz x20, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: // %bb.5: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB1_7 +; CHECK-NEXT: // %bb.6: // %restore.za +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_7: // %return +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB1_8: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_9: // %lpad +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz x19, #0, .LBB1_11 +; CHECK-NEXT: // %bb.10: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_11: // %lpad +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_13 +; CHECK-NEXT: // %bb.12: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_13: // %lpad +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_15 +; CHECK-NEXT: // %bb.14: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_15: // %lpad +; CHECK-NEXT: smstart za +; CHECK-NEXT: 
sub x0, x29, #80 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #112 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sturh w9, [x29, #-104] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz x19, #0, .LBB1_17 +; CHECK-NEXT: // %bb.16: // %lpad +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_17: // %lpad +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: tbz x19, #0, .LBB1_19 +; CHECK-NEXT: // %bb.18: // %lpad +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_19: // %lpad +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #112 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB1_21 +; CHECK-NEXT: // %bb.20: // %lpad +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_21: // %lpad +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB1_8 +; +; CHECK-LABEL: private_za_in_catch_block: +; CHECK: .Lfunc_begin2: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB2_2: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_3: // %lpad1 +; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x19, x29, #48 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_5: // %lpad1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB2_7 +; CHECK-NEXT: // %bb.6: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_7: // %lpad1 +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB2_2 +; +; 
CHECK-LABEL: private_za_in_catch_block_from_new_za: +; CHECK: .Lfunc_begin3: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %prelude +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %save.za +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB3_2: // %entry +; CHECK-NEXT: smstart za +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp10: +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: .LBB3_4: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_5: // %lpad1 +; CHECK-NEXT: .Ltmp11: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x19, x29, #48 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB3_7 +; CHECK-NEXT: // %bb.6: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_7: // %lpad1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB3_9 +; CHECK-NEXT: // %bb.8: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_9: // %lpad1 +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB3_4 +; +; CHECK-LABEL: call_after_catch_block: +; CHECK: .Lfunc_begin4: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp12: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp13: +; CHECK-NEXT: .LBB4_1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b fizz_shared +; CHECK-NEXT: .LBB4_2: // %lpad1 +; CHECK-NEXT: .Ltmp14: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB4_10 +; CHECK-NEXT: // %bb.3: // %catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp15: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp16: +; CHECK-NEXT: // %bb.4: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB4_6 +; CHECK-NEXT: // %bb.5: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB4_6: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_7: // %lpad2 +; CHECK-NEXT: .Ltmp17: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp18: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp19: +; CHECK-NEXT: // %bb.8: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB4_10 +; CHECK-NEXT: // %bb.9: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB4_10: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB4_11: // %terminate.lpad +; CHECK-NEXT: .Ltmp20: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: multiple_catch: +; CHECK: .Lfunc_begin5: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, 
x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp21: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp22: +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w19, #42 // =0x2a +; CHECK-NEXT: .LBB5_2: // %return +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_3: // %lpad1 +; CHECK-NEXT: .Ltmp23: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #2 +; CHECK-NEXT: b.ne .LBB5_6 +; CHECK-NEXT: // %bb.4: // %catch1 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp29: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp30: +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w19, #42 // =0x2a +; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: .LBB5_6: // %catch.fallthrough +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB5_16 +; CHECK-NEXT: // %bb.7: // %catch2 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp24: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp25: +; CHECK-NEXT: // %bb.8: +; CHECK-NEXT: mov w19, #23 // =0x17 +; CHECK-NEXT: .LBB5_9: // %return.sink.split +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB5_11 +; CHECK-NEXT: // %bb.10: // %return.sink.split +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_11: // %return.sink.split +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB5_2 +; CHECK-NEXT: .LBB5_12: // %lpad2 +; CHECK-NEXT: .Ltmp26: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; 
CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp27: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp28: +; CHECK-NEXT: b .LBB5_14 +; CHECK-NEXT: .LBB5_13: // %lpad3 +; CHECK-NEXT: .Ltmp31: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp32: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp33: +; CHECK-NEXT: .LBB5_14: // %check.za13 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB5_16 +; CHECK-NEXT: // %bb.15: // %restore.za14 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_16: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB5_17: // %terminate.lpad +; CHECK-NEXT: .Ltmp34: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: try_throw: +; CHECK: .Lfunc_begin6: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x19, x19 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: sub x10, x29, #48 +; CHECK-NEXT: mov w0, #4 // =0x4 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w19, [x29, #-40] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_2: // %entry +; CHECK-NEXT: mov w9, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w19, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp35: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp36: +; CHECK-NEXT: // %bb.3: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_5 +; CHECK-NEXT: // %bb.4: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_5: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB6_6: // %lpad +; CHECK-NEXT: .Ltmp37: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: 
smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp38: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp39: +; CHECK-NEXT: // %bb.7: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_9 +; CHECK-NEXT: // %bb.8: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_9: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b bar_shared +; CHECK-NEXT: .LBB6_10: // %lpad1 +; CHECK-NEXT: .Ltmp40: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp41: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp42: +; CHECK-NEXT: // %bb.11: // %check.za5 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB6_13 +; CHECK-NEXT: // %bb.12: // %restore.za6 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_13: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB6_14: // %terminate.lpad +; CHECK-NEXT: .Ltmp43: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: catch_throw: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp44: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp45: +; CHECK-NEXT: // %bb.1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: b bar_shared +; CHECK-NEXT: .LBB7_2: // %lpad1 +; CHECK-NEXT: .Ltmp46: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: mov w0, #4 // =0x4 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: sturh w20, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB7_4 +; CHECK-NEXT: // %bb.3: // %lpad1 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_4: // %lpad1 +; CHECK-NEXT: mov w9, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp47: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp48: +; CHECK-NEXT: // %bb.5: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB7_7 +; CHECK-NEXT: // %bb.6: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_7: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB7_8: // %lpad2 +; CHECK-NEXT: .Ltmp49: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp50: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp51: +; CHECK-NEXT: // %bb.9: // %check.za5 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB7_11 +; CHECK-NEXT: // %bb.10: // %restore.za6 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB7_11: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl 
_Unwind_Resume +; CHECK-NEXT: .LBB7_12: // %terminate.lpad +; CHECK-NEXT: .Ltmp52: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: nested: +; CHECK: .Lfunc_begin8: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x19, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp53: +; CHECK-NEXT: bl foo_shared +; CHECK-NEXT: .Ltmp54: +; CHECK-NEXT: // %bb.1: // %invoke.cont1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x19 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp56: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp57: +; CHECK-NEXT: .LBB8_2: // %try.cont14 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_3: // %lpad2 +; CHECK-NEXT: .Ltmp58: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #2 +; CHECK-NEXT: b.ne .LBB8_14 +; CHECK-NEXT: // %bb.4: // %catch1 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp59: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp60: +; CHECK-NEXT: // %bb.5: // %invoke.cont2 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp64: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp65: +; CHECK-NEXT: // %bb.6: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.7: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB8_2 +; CHECK-NEXT: .LBB8_8: // %lpad4 +; CHECK-NEXT: .Ltmp66: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: b .LBB8_12 +; CHECK-NEXT: .LBB8_9: // %lpad3 +; CHECK-NEXT: .Ltmp61: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 
+; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp62: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp63: +; CHECK-NEXT: // %bb.10: // %check.za13 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_12 +; CHECK-NEXT: // %bb.11: // %restore.za14 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_12: // %eh.cleanup +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB8_14 +; CHECK-NEXT: .LBB8_13: // %lpad1 +; CHECK-NEXT: .Ltmp55: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: .LBB8_14: // %catch2 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp67: +; CHECK-NEXT: bl buzz_shared +; CHECK-NEXT: .Ltmp68: +; CHECK-NEXT: // %bb.15: // %invoke.cont3 +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_17 +; CHECK-NEXT: // %bb.16: // %invoke.cont3 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_17: // %invoke.cont3 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB8_2 +; CHECK-NEXT: .LBB8_18: // %lpad5 +; CHECK-NEXT: .Ltmp69: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp70: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp71: +; CHECK-NEXT: // %bb.19: // %check.za25 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB8_21 +; CHECK-NEXT: // %bb.20: // %restore.za26 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_21: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB8_22: // %terminate.lpad +; CHECK-NEXT: .Ltmp72: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: conditional_throw: +; CHECK: .Lfunc_begin9: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: tbnz w0, #0, .LBB9_3 +; CHECK-NEXT: // %bb.1: // %if.end +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp73: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp74: +; CHECK-NEXT: .LBB9_2: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_3: // %if.then +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: mov w0, #4 // =0x4 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: sturh w19, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_allocate_exception +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-NEXT: cbnz x9, .LBB9_5 +; CHECK-NEXT: // %bb.4: // %if.then +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_5: // %if.then +; CHECK-NEXT: mov w9, #23 // =0x17 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: sturh w19, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp75: +; CHECK-NEXT: adrp x1, :got:except +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: ldr x1, [x1, :got_lo12:except] +; CHECK-NEXT: bl __cxa_throw +; CHECK-NEXT: .Ltmp76: +; CHECK-NEXT: // %bb.6: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_8 +; CHECK-NEXT: // %bb.7: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_8: // %unreachable +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB9_9: // %lpad +; CHECK-NEXT: .Ltmp77: +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp78: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp79: +; CHECK-NEXT: // %bb.10: // %invoke.cont2 +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, 
#48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_12 +; CHECK-NEXT: // %bb.11: // %invoke.cont2 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_12: // %invoke.cont2 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB9_2 +; CHECK-NEXT: .LBB9_13: // %lpad1 +; CHECK-NEXT: .Ltmp80: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp81: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp82: +; CHECK-NEXT: // %bb.14: // %check.za9 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB9_16 +; CHECK-NEXT: // %bb.15: // %restore.za10 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB9_16: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB9_17: // %terminate.lpad +; CHECK-NEXT: .Ltmp83: +; CHECK-NEXT: bl __clang_call_terminate +; +; CHECK-LABEL: throw_in_signature: +; CHECK: .Lfunc_begin10: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x9, x9, x8 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x8, x8, #15 +; CHECK-NEXT: stur x9, [x29, #-48] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #32 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: stur x9, [x29, #-6] +; CHECK-NEXT: tbz w0, #0, .LBB10_3 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp86: +; CHECK-NEXT: bl bar_shared +; CHECK-NEXT: .Ltmp87: +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov w0, #42 // =0x2a +; CHECK-NEXT: b .LBB10_5 +; CHECK-NEXT: .LBB10_3: // %if.end +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp84: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp85: +; CHECK-NEXT: // %bb.4: +; CHECK-NEXT: mov w0, #23 // =0x17 +; CHECK-NEXT: .LBB10_5: // %return +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_6: // %lpad +; CHECK-NEXT: .Ltmp88: +; CHECK-NEXT: bl __cxa_call_unexpected +; +; CHECK-LABEL: try_func: +; CHECK: .Lfunc_begin11: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x10, x10, x9 +; CHECK-NEXT: mov sp, x10 +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: stur x10, [x29, #-48] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: stur wzr, [x29, #-36] +; CHECK-NEXT: sturh wzr, [x29, #-38] +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #32 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur x10, [x29, #-6] +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp89: +; CHECK-NEXT: bl fizz_shared +; CHECK-NEXT: .Ltmp90: +; CHECK-NEXT: .LBB11_1: // %try.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_2: // %lpad +; CHECK-NEXT: .Ltmp91: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.ne .LBB11_10 +; CHECK-NEXT: // %bb.3: // %catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .Ltmp92: +; CHECK-NEXT: bl buzz_shared +; CHECK-NEXT: .Ltmp93: +; CHECK-NEXT: // %bb.4: // %invoke.cont +; CHECK-NEXT: sub x8, x29, #48 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sturh w9, [x29, #-40] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: sub x0, x29, #48 +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB11_6 +; CHECK-NEXT: // %bb.5: // %invoke.cont +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB11_6: // %invoke.cont +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_7: // %lpad1 +; CHECK-NEXT: .Ltmp94: +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: smstart za +; CHECK-NEXT: msr TPIDR2_EL0, x20 +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: .Ltmp95: +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: .Ltmp96: +; CHECK-NEXT: // %bb.8: // %check.za +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbnz x8, .LBB11_10 +; CHECK-NEXT: // %bb.9: // %restore.za +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB11_10: // %eh.resume +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: .LBB11_11: // %terminate.lpad +; CHECK-NEXT: .Ltmp97: +; CHECK-NEXT: bl __clang_call_terminate diff --git a/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll new file mode 
100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-exceptions-with-za-state.ll @@ -0,0 +1,1609 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s + +declare void @foo_shared() "aarch64_pstate_za_shared" +declare void @bar_shared() "aarch64_pstate_za_shared" +declare void @fizz_shared() "aarch64_pstate_za_shared" +declare void @buzz_shared() "aarch64_pstate_za_shared" + +declare void @snap_new() "aarch64_pstate_za_new" + +declare i32 @normal_callee() + +@except = external constant ptr +@invalid_argument = external constant ptr +@overflow_error = external constant ptr +@runtime_error = external constant ptr + +; Try/Catch with one invoke of a private ZA function. +; Lazy-save must be set up for the invoke and for the private callee. + +define i32 @private_za_invoke() "aarch64_pstate_za_shared" "aarch64_pstate_sm_enabled" personality i32 1 { +; CHECK-LABEL: @private_za_invoke( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @normal_callee() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[RETURN]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label 
[[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[CHECK_ZA]] ], [ 15, [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @normal_callee() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a new ZA function. + +define i32 @new_za_invoke() "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" personality i32 1 { +; CHECK-LABEL: @new_za_invoke( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call aarch64_sme_preservemost_from_x2 [[TMP0]] @__arm_sme_state() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [[TMP0]] [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: invoke void @snap_new() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[RETURN]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD]] ], [ 15, [[CHECK_ZA]] ], [ 15, [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; 
CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @snap_new() to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a shared ZA function and a private ZA call in the +; (unconditional) catch block. + +; int unconditional() { +; try { +; foo(); +; } catch (...) { +; normal_callee(); +; return 23; +; } +; return 15; +; } + +define i32 @private_za_in_catch_block() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @private_za_in_catch_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD1]] ], [ 15, [[ENTRY:%.*]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @foo_shared() to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + call void @normal_callee() + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad1 ], [ 15, %entry ] + ret i32 %retval +} + +; Try/Catch with one invoke of a shared ZA function and a private ZA call in the +; (unconditional) catch block. + +; int unconditional() { +; try { +; foo(); +; } catch (...) 
{ +; normal_callee(); +; return 23; +; } +; return 15; +; } + +define i32 @private_za_in_catch_block_from_new_za() "aarch64_pstate_za_new" personality i32 1 { +; CHECK-LABEL: @private_za_in_catch_block_from_new_za( +; CHECK-NEXT: prelude: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[SAVE_ZA:%.*]], label [[ENTRY:%.*]] +; CHECK: save.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: br label [[ENTRY]] +; CHECK: entry: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 23, [[LPAD1]] ], [ 15, [[ENTRY]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + invoke void @foo_shared() to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + call void @normal_callee() + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 23, %lpad1 ], [ 15, %entry ] + ret i32 %retval +} + + +; +; Try/Catch with one catch clause. 
Set up & restore lazy-save +; +; void except_func() { +; try { +; foo(); +; } catch (const std::invalid_argument& e) { +; bar(); +; } +; fizz(); +; } + +define void @call_after_catch_block() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @call_after_catch_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @invalid_argument +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @invalid_argument) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT]] +; CHECK: try.cont: +; CHECK-NEXT: call void 
@llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @fizz_shared() +; CHECK-NEXT: ret void +; CHECK: lpad2: +; CHECK-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL6_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[LPAD1]] ], [ [[TMP5]], [[CHECK_ZA]] ], [ [[TMP5]], [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL6_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP6:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP6]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP7]]) +; CHECK-NEXT: unreachable +; + +entry: + invoke void @foo_shared() to label %try.cont unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } catch ptr @invalid_argument + %1 = extractvalue { ptr, i32 } %0, 1 + %2 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @invalid_argument) + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %catch, label %eh.resume + +catch: + %3 = extractvalue { ptr, i32 } %0, 0 + %4 = tail call ptr @__cxa_begin_catch(ptr %3) + invoke void @bar_shared() to label %invoke.cont unwind label %lpad2 + +invoke.cont: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + tail call void @fizz_shared() + ret void + +lpad2: + %5 = landingpad { ptr, i32 } cleanup + invoke void @__cxa_end_catch() to label %eh.resume unwind label %terminate.lpad + +eh.resume: + %lpad.val6.merged = phi { ptr, i32 } [ %0, %lpad1 ], [ %5, %lpad2 ] + resume { ptr, i32 } %lpad.val6.merged + +terminate.lpad: + %6 = landingpad { ptr, i32 } catch ptr null + %7 = extractvalue { ptr, i32 } %6, 0 + tail call void @__clang_call_terminate(ptr %7) + unreachable +} + +; +; Try/Catch with two catch clauses. 
Set up & restore lazy-save +; +; int multiple_catch() { +; try { +; foo(); +; } catch (const std::overflow_error& e) { +; bar(); +; } catch (const std::runtime_error& e) { +; fizz(); +; return 23; +; } +; return 42; +; } + +define i32 @multiple_catch() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @multiple_catch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @overflow_error +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @overflow_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH1:%.*]], label [[CATCH_FALLTHROUGH:%.*]] +; CHECK: catch1: +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[RETURN_SINK_SPLIT:%.*]] unwind label [[LPAD3:%.*]] +; CHECK: catch.fallthrough: +; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; 
CHECK-NEXT: [[MATCHES1:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]] +; CHECK-NEXT: br i1 [[MATCHES1]], label [[CATCH2:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch2: +; CHECK-NEXT: [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[RETURN_SINK_SPLIT]] unwind label [[LPAD2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE9:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC10:%.*]] = trunc i64 [[LIVE9]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE11:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC10]], ptr [[TPIDR2_OBJ_LIVE11]], align 2 +; CHECK-NEXT: [[TPI_INT12:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT12]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: lpad3: +; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE15:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC16:%.*]] = trunc i64 [[LIVE15]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE17:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC16]], ptr [[TPIDR2_OBJ_LIVE17]], align 2 +; CHECK-NEXT: [[TPI_INT18:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT18]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA13:%.*]] unwind label [[TERMINATE_LPAD]] +; CHECK: return.sink.split: +; CHECK-NEXT: [[RETVAL_PH:%.*]] = phi i32 [ 42, [[CATCH1]] ], [ 23, [[CATCH2]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 42, [[ENTRY:%.*]] ], [ [[RETVAL_PH]], [[RETURN_SINK_SPLIT]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 
[[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: check.za13: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR219:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[TPIDR219]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[RESTORE_ZA14:%.*]], label [[EH_RESUME]] +; CHECK: restore.za14: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL13_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[CATCH_FALLTHROUGH]] ], [ [[TMP7]], [[CHECK_ZA]] ], [ [[TMP8]], [[CHECK_ZA13]] ], [ [[TMP7]], [[RESTORE_ZA]] ], [ [[TMP8]], [[RESTORE_ZA14]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL13_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP9:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, i32 } [[TMP9]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP10]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %return unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr @overflow_error + catch ptr @runtime_error + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = extractvalue { ptr, i32 } %0, 1 + %3 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @overflow_error) + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch1, label %catch.fallthrough + +catch1: + %4 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @bar_shared() + to label %return.sink.split unwind label %lpad3 + +catch.fallthrough: + %5 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches1 = icmp eq i32 %2, %5 + br i1 %matches1, label %catch2, label %eh.resume + +catch2: + %6 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @fizz_shared() + to label %return.sink.split unwind label %lpad2 + +lpad2: + %7 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +lpad3: + %8 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +return.sink.split: + %retval.ph = phi i32 [ 42, %catch1 ], [ 23, %catch2 ] + tail call void @__cxa_end_catch() + br label %return + +return: + %retval = phi i32 [ 42, %entry ], [ %retval.ph, %return.sink.split ] + ret i32 %retval + +eh.resume: + %lpad.val13.merged = phi { ptr, i32 } [ %0, %catch.fallthrough ], [ %7, %lpad2 ], [ %8, %lpad3 ] + resume { ptr, i32 } %lpad.val13.merged + +terminate.lpad: + %9 = landingpad { ptr, i32 } + catch ptr null + %10 = extractvalue { ptr, i32 } %9, 0 + tail call void @__clang_call_terminate(ptr %10) + unreachable +} + +; +; Try/Catch with throw in try block. Set up & restore lazy-save +; +; void try_throw() { +; try { +; throw 23; +; } catch (...) 
{ +; foo(); +; } +; bar(); +; return; +; } + +define void @try_throw() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @try_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 4 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: tail call void @bar_shared() +; CHECK-NEXT: ret void +; CHECK: lpad1: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE7:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC8:%.*]] = trunc i64 [[LIVE7]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE9:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: 
store i16 [[LIVE_TRUNC8]], ptr [[TPIDR2_OBJ_LIVE9]], align 2 +; CHECK-NEXT: [[TPI_INT10:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT10]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA5:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za5: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR211:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[TPIDR211]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label [[RESTORE_ZA6:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za6: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @foo_shared() + to label %invoke.cont unwind label %lpad1 + +invoke.cont: + tail call void @__cxa_end_catch() + tail call void @bar_shared() + ret void + +lpad1: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; Try/Catch with throw in catch block. Set up & restore lazy-save +; +; void throw_except() { +; try { +; foo(); +; } catch (...) 
{ +; throw 23; +; } +; bar(); +; return; +; } + +define void @catch_throw() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @catch_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 4 +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE7:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC8:%.*]] = trunc i64 [[LIVE7]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE9:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC8]], ptr [[TPIDR2_OBJ_LIVE9]], align 2 +; CHECK-NEXT: [[TPI_INT10:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; 
CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT10]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA5:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @bar_shared() +; CHECK-NEXT: ret void +; CHECK: check.za5: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR211:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[TPIDR211]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label [[RESTORE_ZA6:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za6: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %try.cont unwind label %lpad1 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad2 + +lpad2: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +try.cont: + tail call void @bar_shared() + ret void + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; Nested Try/Catch. Set up & restore lazy-save +; +; void nested() { +; try { +; foo(); +; try { +; bar(); +; } catch (const std::runtime_error& e) { +; fizz(); +; } +; } +; catch (...) 
{ +; buzz(); +; } +; } + +define void @nested() "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @foo_shared() +; CHECK-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont1: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[TRY_CONT14:%.*]] unwind label [[LPAD2:%.*]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: br label [[CATCH2:%.*]] +; CHECK: lpad2: +; CHECK-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH1:%.*]], label [[CATCH2]] +; CHECK: catch1: +; CHECK-NEXT: [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; 
CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD3:%.*]] +; CHECK: invoke.cont2: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[LIVE9:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC10:%.*]] = trunc i64 [[LIVE9]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE11:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC10]], ptr [[TPIDR2_OBJ_LIVE11]], align 2 +; CHECK-NEXT: [[TPI_INT12:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT12]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD4:%.*]] +; CHECK: lpad3: +; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[LIVE15:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC16:%.*]] = trunc i64 [[LIVE15]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE17:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC16]], ptr [[TPIDR2_OBJ_LIVE17]], align 2 +; CHECK-NEXT: [[TPI_INT18:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT18]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA13:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: lpad4: +; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: br label [[EH_CLEANUP:%.*]] +; CHECK: check.za13: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR219:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[TPIDR219]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[RESTORE_ZA14:%.*]], label [[EH_CLEANUP]] +; CHECK: restore.za14: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_CLEANUP]] +; CHECK: eh.cleanup: +; CHECK-NEXT: [[PN:%.*]] = phi { ptr, i32 } [ [[TMP8]], [[LPAD4]] ], [ [[TMP7]], [[CHECK_ZA13]] ], [ [[TMP7]], [[RESTORE_ZA14]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: [[EXN_SLOT_0:%.*]] = extractvalue { ptr, i32 } [[PN]], 0 +; CHECK-NEXT: br label [[CATCH2]] +; CHECK: catch2: +; CHECK-NEXT: [[EXN_SLOT_1:%.*]] = phi ptr [ [[EXN_SLOT_0]], [[EH_CLEANUP]] ], [ [[TMP3]], [[LPAD2]] ], [ [[TMP1]], [[LPAD1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN_SLOT_1]]) +; 
CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE21:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC22:%.*]] = trunc i64 [[LIVE21]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE23:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC22]], ptr [[TPIDR2_OBJ_LIVE23]], align 2 +; CHECK-NEXT: [[TPI_INT24:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT24]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @buzz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD5:%.*]] +; CHECK: invoke.cont3: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT14]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[TRY_CONT14]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[TRY_CONT14]] +; CHECK: try.cont14: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: lpad5: +; CHECK-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE27:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC28:%.*]] = trunc i64 [[LIVE27]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE29:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC28]], ptr [[TPIDR2_OBJ_LIVE29]], align 2 +; CHECK-NEXT: [[TPI_INT30:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT30]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA25:%.*]] unwind label [[TERMINATE_LPAD]] +; CHECK: check.za25: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR231:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP32:%.*]] = icmp eq i64 [[TPIDR231]], 0 +; CHECK-NEXT: br i1 [[CMP32]], label [[RESTORE_ZA26:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za26: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP10]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP12]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @foo_shared() + to label %invoke.cont1 unwind label %lpad1 + 
+invoke.cont1: + invoke void @bar_shared() + to label %try.cont14 unwind label %lpad2 + +lpad1: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + br label %catch2 + +lpad2: + %2 = landingpad { ptr, i32 } + catch ptr @runtime_error + catch ptr null + %3 = extractvalue { ptr, i32 } %2, 0 + %4 = extractvalue { ptr, i32 } %2, 1 + %5 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches = icmp eq i32 %4, %5 + br i1 %matches, label %catch1, label %catch2 + +catch1: + %6 = tail call ptr @__cxa_begin_catch(ptr %3) + invoke void @fizz_shared() + to label %invoke.cont2 unwind label %lpad3 + +invoke.cont2: + invoke void @__cxa_end_catch() + to label %try.cont14 unwind label %lpad4 + +lpad3: + %7 = landingpad { ptr, i32 } + catch ptr null + invoke void @__cxa_end_catch() + to label %eh.cleanup unwind label %terminate.lpad + +lpad4: + %8 = landingpad { ptr, i32 } + catch ptr null + br label %eh.cleanup + +eh.cleanup: + %pn = phi { ptr, i32 } [ %8, %lpad4 ], [ %7, %lpad3 ] + %exn.slot.0 = extractvalue { ptr, i32 } %pn, 0 + br label %catch2 + +catch2: + %exn.slot.1 = phi ptr [ %exn.slot.0, %eh.cleanup ], [ %3, %lpad2 ], [ %1, %lpad1 ] + %9 = tail call ptr @__cxa_begin_catch(ptr %exn.slot.1) + invoke void @buzz_shared() + to label %invoke.cont3 unwind label %lpad5 + +invoke.cont3: + tail call void @__cxa_end_catch() + br label %try.cont14 + +try.cont14: + ret void + +lpad5: + %10 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %10 + +terminate.lpad: + %11 = landingpad { ptr, i32 } + catch ptr null + %12 = extractvalue { ptr, i32 } %11, 0 + tail call void @__clang_call_terminate(ptr %12) + unreachable +} + +; void try_condition(bool cond) { +; try { +; if (cond) { +; throw 23; +; } +; bar(); +; } +; catch (...) 
{ +; fizz(); +; } +; } + +define void @conditional_throw(i1 %cond) "aarch64_pstate_za_shared" personality i32 0 { +; CHECK-LABEL: @conditional_throw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) +; CHECK-NEXT: store i32 23, ptr [[EXCEPTION]], align 16 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @except, ptr null) +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP1]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont2: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT:%.*]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: 
[[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[TRY_CONT]] unwind label [[LPAD]] +; CHECK: lpad1: +; CHECK-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE11:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC12:%.*]] = trunc i64 [[LIVE11]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE13:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC12]], ptr [[TPIDR2_OBJ_LIVE13]], align 2 +; CHECK-NEXT: [[TPI_INT14:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT14]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA9:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za9: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR215:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP16:%.*]] = icmp eq i64 [[TPIDR215]], 0 +; CHECK-NEXT: br i1 [[CMP16]], label [[RESTORE_ZA10:%.*]], label [[EH_RESUME:%.*]] +; CHECK: restore.za10: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP3]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP4:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP4]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP5]]) +; CHECK-NEXT: unreachable +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[UNREACHABLE:%.*]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[UNREACHABLE]] +; CHECK: unreachable: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: unreachable +; +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %exception = tail call ptr @__cxa_allocate_exception(i64 4) + store i32 23, ptr %exception, align 16 + invoke void @__cxa_throw(ptr nonnull %exception, ptr nonnull @except, ptr null) + to label %unreachable unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr null + %1 = extractvalue { ptr, i32 } %0, 0 + %2 = tail call ptr @__cxa_begin_catch(ptr %1) + invoke void @fizz_shared() + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + 
ret void + +if.end: + invoke void @bar_shared() + to label %try.cont unwind label %lpad + +lpad1: + %3 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + resume { ptr, i32 } %3 + +terminate.lpad: + %4 = landingpad { ptr, i32 } + catch ptr null + %5 = extractvalue { ptr, i32 } %4, 0 + tail call void @__clang_call_terminate(ptr %5) + unreachable + +unreachable: + unreachable +} + +; +; int foo_throw(bool a) throw(const char *) { +; if (a) { +; bar(); +; return 42; +; } +; fizz(); +; return 23; +; } +; + +define i32 @throw_in_signature(i1 %a) "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @throw_in_signature( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: br i1 [[A:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @bar_shared() +; CHECK-NEXT: to label [[RETURN:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: filter [1 x ptr] [ptr @except] +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[EHSPEC_FAILS:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[EHSPEC_FAILS]], label [[EHSPEC_UNEXPECTED:%.*]], label [[EH_RESUME:%.*]] +; CHECK: ehspec.unexpected: +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: tail call void @__cxa_call_unexpected(ptr [[TMP2]]) +; CHECK-NEXT: unreachable +; CHECK: if.end: +; CHECK-NEXT: [[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; 
CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[RETURN]] unwind label [[LPAD]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 42, [[IF_THEN]] ], [ 23, [[IF_END]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK: eh.resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[TMP0]] +; +entry: + br i1 %a, label %if.then, label %if.end + +if.then: + invoke void @bar_shared() + to label %return unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + filter [1 x ptr] [ptr @except] + %1 = extractvalue { ptr, i32 } %0, 1 + %ehspec.fails = icmp slt i32 %1, 0 + br i1 %ehspec.fails, label %ehspec.unexpected, label %eh.resume + +ehspec.unexpected: + %2 = extractvalue { ptr, i32 } %0, 0 + tail call void @__cxa_call_unexpected(ptr %2) #2 + unreachable + +if.end: + invoke void @fizz_shared() + to label %return unwind label %lpad + +return: + %retval = phi i32 [ 42, %if.then ], [ 23, %if.end ] + ret i32 %retval + +eh.resume: + resume { ptr, i32 } %0 +} + +; +; void try_func() try { +; fizz(); +; } +; catch (const std::runtime_error& e) { +; buzz(); +; } +; + +define void @try_func() "aarch64_pstate_za_shared" personality i32 1 { +; CHECK-LABEL: @try_func( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ:%.*]] = alloca { ptr, i16, [6 x i8] }, align 8 +; CHECK-NEXT: [[N:%.*]] = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: [[NN:%.*]] = mul i64 [[N]], [[N]] +; CHECK-NEXT: [[BUFFER:%.*]] = alloca i8, i64 [[NN]], align 16 +; CHECK-NEXT: [[TPIDR2_INVOKE_OBJ_BUFFER:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 0 +; CHECK-NEXT: store ptr [[BUFFER]], ptr [[TPIDR2_INVOKE_OBJ_BUFFER]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca i48, i48 0, align 16 +; CHECK-NEXT: [[RESERVED:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 2 +; CHECK-NEXT: store ptr [[ZERO]], ptr [[RESERVED]], align 8 +; CHECK-NEXT: [[LIVE:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC:%.*]] = trunc i64 [[LIVE]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC]], ptr [[TPIDR2_OBJ_LIVE]], align 2 +; CHECK-NEXT: [[TPI_INT:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @fizz_shared() +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]] +; CHECK: lpad: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr @runtime_error +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: 
[[LIVE1:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC2:%.*]] = trunc i64 [[LIVE1]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE3:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC2]], ptr [[TPIDR2_OBJ_LIVE3]], align 2 +; CHECK-NEXT: [[TPI_INT4:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT4]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: invoke void @buzz_shared() +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD1:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: tail call void @__cxa_end_catch() +; CHECK-NEXT: br label [[TRY_CONT]] +; CHECK: try.cont: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; CHECK: lpad1: +; CHECK-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[LIVE5:%.*]] = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: [[LIVE_TRUNC6:%.*]] = trunc i64 [[LIVE5]] to i16 +; CHECK-NEXT: [[TPIDR2_OBJ_LIVE7:%.*]] = getelementptr { ptr, i16, [6 x i8] }, ptr [[TPIDR2_INVOKE_OBJ]], i64 0, i32 1 +; CHECK-NEXT: store i16 [[LIVE_TRUNC6]], ptr [[TPIDR2_OBJ_LIVE7]], align 2 +; CHECK-NEXT: [[TPI_INT8:%.*]] = ptrtoint ptr [[TPIDR2_INVOKE_OBJ]] to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 [[TPI_INT8]]) +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +; CHECK-NEXT: invoke void @__cxa_end_catch() +; CHECK-NEXT: to label [[CHECK_ZA:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +; CHECK: check.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TPIDR2]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RESTORE_ZA:%.*]], label [[EH_RESUME]] +; CHECK: restore.za: +; CHECK-NEXT: call aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_restore(ptr [[TPIDR2_INVOKE_OBJ]]) +; CHECK-NEXT: br label [[EH_RESUME]] +; CHECK: eh.resume: +; CHECK-NEXT: [[LPAD_VAL6_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP0]], [[LPAD]] ], [ [[TMP5]], [[CHECK_ZA]] ], [ [[TMP5]], [[RESTORE_ZA]] ] +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() +; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL6_MERGED]] +; CHECK: terminate.lpad: +; CHECK-NEXT: [[TMP6:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP6]], 0 +; CHECK-NEXT: tail call void @__clang_call_terminate(ptr [[TMP7]]) +; CHECK-NEXT: unreachable +; +entry: + invoke void @fizz_shared() + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { ptr, i32 } + catch ptr @runtime_error + %1 = extractvalue { ptr, i32 } %0, 1 + %2 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @runtime_error) + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %catch, label %eh.resume + +catch: + %3 = extractvalue { ptr, i32 } %0, 0 + %4 = tail call ptr @__cxa_begin_catch(ptr %3) #4 + invoke void @buzz_shared() + to label %invoke.cont unwind label %lpad1 + +invoke.cont: + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + ret void + 
+lpad1: + %5 = landingpad { ptr, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: + %lpad.val6.merged = phi { ptr, i32 } [ %0, %lpad ], [ %5, %lpad1 ] + resume { ptr, i32 } %lpad.val6.merged + +terminate.lpad: + %6 = landingpad { ptr, i32 } + catch ptr null + %7 = extractvalue { ptr, i32 } %6, 0 + tail call void @__clang_call_terminate(ptr %7) + unreachable +} + +declare i32 @llvm.eh.typeid.for(ptr) +declare ptr @__cxa_begin_catch(ptr) +declare void @__cxa_end_catch() +declare ptr @__cxa_allocate_exception(i64) +declare void @__cxa_throw(ptr, ptr, ptr) +declare void @__cxa_call_unexpected(ptr) noreturn +declare void @__clang_call_terminate(ptr) noreturn + +; CHECK: attributes #0 = { "aarch64_pstate_za_shared" "target-features"="+sme" } +; CHECK: attributes #1 = { "aarch64_pstate_za_new" "target-features"="+sme" } +; CHECK: attributes #2 = { "target-features"="+sme" } +; CHECK: attributes #3 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_enabled" "aarch64_pstate_za_shared" "target-features"="+sme" } +; CHECK: attributes #4 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" "target-features"="+sme" } +; CHECK: attributes #5 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_shared" "target-features"="+sme" } +; CHECK: attributes #6 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_new" "target-features"="+sme" } +; CHECK: attributes #7 = { nounwind memory(none) "target-features"="+sme" } +; CHECK: attributes #8 = { noreturn "target-features"="+sme" } +; CHECK: attributes #9 = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #10 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #11 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" } +; CHECK: attributes #12 = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } +; CHECK: attributes #13 = { "aarch64_pstate_za_preserved" } diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-get-live-za-slices.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +define i64 @sme_get_live_slices() { +; CHECK-LABEL: sme_get_live_slices: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #1 +; CHECK-NEXT: ret + %slices = call i64 @llvm.aarch64.sme.get.live.za.slices() + ret i64 %slices +} + +declare i64 @llvm.aarch64.sme.get.live.za.slices() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -329,6 +329,33 @@ ret void } +; Test that we can store to the stack as well +; (this requires adding these opcodes to AArch64InstrInfo::getMemOpInfo) +define void @test_memopinfo_sme_ldr() nounwind { +; CHECK-LABEL: test_memopinfo_sme_ldr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    addvl x9, x8, #15
+; CHECK-NEXT:    ldr za[w12, 0], [x8]
+; CHECK-NEXT:    addvl x8, x8, #16
+; CHECK-NEXT:    ldr za[w12, 0], [x9]
+; CHECK-NEXT:    ldr za[w12, 0], [x8]
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %ptr = alloca <vscale x 16 x i8>, i64 16, align 16
+  %ptr15 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 15
+  %ptr16 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 16
+  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr)
+  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr15)
+  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr16)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, ptr, i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -329,6 +329,33 @@
   ret void
 }
 
+; Test that we can store to the stack as well
+; (this requires adding these opcodes to AArch64InstrInfo::getMemOpInfo)
+define void @test_memopinfo_sme_str() nounwind {
+; CHECK-LABEL: test_memopinfo_sme_str:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    addvl x9, x8, #15
+; CHECK-NEXT:    str za[w12, 0], [x8]
+; CHECK-NEXT:    addvl x8, x8, #16
+; CHECK-NEXT:    str za[w12, 0], [x9]
+; CHECK-NEXT:    str za[w12, 0], [x8]
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %ptr = alloca <vscale x 16 x i8>, i64 16, align 16
+  %ptr15 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 15
+  %ptr16 = getelementptr <vscale x 16 x i8>, ptr %ptr, i64 16
+  call void @llvm.aarch64.sme.str(i32 0, ptr %ptr)
+  call void @llvm.aarch64.sme.str(i32 0, ptr %ptr15)
+  call void @llvm.aarch64.sme.str(i32 0, ptr %ptr16)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, ptr, i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll b/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-invoke-resume-pstatesm.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define void @unconditional_smstart() nounwind {
+; CHECK-LABEL: unconditional_smstart:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 1)
+  ret void
+}
+
+; FIXME: This case can be optimised away, since we know the condition
+; will evaluate to false.
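+; (A note on the FIXME, going by the checks below: the intrinsic is called
+;  with the constant 0, so the generated 'tbz xzr, #0' always branches past
+;  the smstart, leaving the conditional block as dead code.)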
+define void @conditional_smstart() nounwind { +; CHECK-LABEL: conditional_smstart: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: tbz xzr, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 0) + ret void +} + +define void @conditional_smstart_with_entry_val(i64 %pstate) nounwind { +; CHECK-LABEL: conditional_smstart_with_entry_val: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: tbz x0, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.invoke.resume.pstatesm(i64 %pstate) + ret void +} + +declare void @llvm.aarch64.sme.invoke.resume.pstatesm(i64) diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -11,14 +11,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -44,12 +45,13 @@ ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x19, x8, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x8, x8, x19 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh w19, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x20 @@ -89,14 +91,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za @@ -126,14 +129,15 @@ ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: sturh wzr, [x29, #-70] +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: sturh w9, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -stop-after=aarch64-sme-peephole-opt < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" { + ; CHECK-LABEL: name: streaming_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +define void @locally_streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_body" { + ; CHECK-LABEL: name: locally_streaming_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, 
csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +; FIXME: This test no longer results in any conditional smstart/stop pairs being optimised after adding a check that the pstate.sm register must match. +define void @streaming_compatible_caller_normal_callee() nounwind "aarch64_pstate_sm_compatible" { + ; CHECK-LABEL: name: streaming_compatible_caller_normal_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[COPY]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri1:%[0-9]+]]:gpr64common = ANDXri [[COPY1]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri1]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri1]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri2:%[0-9]+]]:gpr64common = ANDXri [[COPY2]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri2]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, 
implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri2]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + call void @normal_callee(); + call void @normal_callee(); + call void @normal_callee(); + ret void; +} + +define void @normal_caller_streaming_callee() nounwind { + ; CHECK-LABEL: name: normal_caller_streaming_callee + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv + ; CHECK-NEXT: RET_ReallyLR + call void @streaming_callee(); + call void @streaming_callee(); + call void @streaming_callee(); + ret void; +} + +define void @streaming_compatible_caller_mixed_callees() nounwind "aarch64_pstate_sm_compatible" { + ; CHECK-LABEL: name: streaming_compatible_caller_mixed_callees + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64common = ANDXri [[COPY]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri]], 1, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri]], 1, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri1:%[0-9]+]]:gpr64common = ANDXri [[COPY1]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri1]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri1]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: 
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri2:%[0-9]+]]:gpr64common = ANDXri [[COPY2]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri2]], 1, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @streaming_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri2]], 1, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ANDXri3:%[0-9]+]]:gpr64common = ANDXri [[COPY3]], 4096 + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[ANDXri3]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[ANDXri3]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + call void @streaming_callee(); + call void @normal_callee(); + call void @streaming_callee(); + call void @normal_callee(); + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir b/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.mir @@ -0,0 +1,172 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -run-pass=aarch64-sme-peephole-opt -verify-machineinstrs %s -o - | FileCheck %s +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + declare void @normal_callee() #0 + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_match() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_regmask() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + ; Function Attrs: nounwind + define void @streaming_compatible_smstartstop_physical_regs() #1 { + call void @normal_callee() + call void @normal_callee() + ret void + } + + attributes #0 = { "target-features"="+sme" } + attributes #1 = { nounwind "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "target-features"="+sme" } + +... 
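
For reference, the "aarch64_pstate_sm_compatible" attribute in the module above is the IR form of the ACLE streaming-compatible keyword. A minimal C sketch of the source-level counterparts (illustrative only, not part of the patch; the function names are made up and an SME-enabled compiler, e.g. -march=armv9-a+sme, is assumed):

    // Source-level keyword attributes and the IR function attributes they lower to:
    void sc_fn(void) __arm_streaming_compatible; // "aarch64_pstate_sm_compatible"
    void s_fn(void) __arm_streaming;             // "aarch64_pstate_sm_enabled"
    void plain_fn(void);                         // no PSTATE.SM attribute
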
+--- +# smstart/smstop pair can be removed by aarch64-sme-peephole-opt as the virtual registers for pstate.sm match + +name: streaming_compatible_smstartstop_match +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_match + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %0:gpr64 = COPY $x0 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
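
A rough C-level view of the pattern exercised above (a sketch, not part of the patch; the caller/callee names are illustrative and an SME-enabled compiler is assumed): in a streaming-compatible function, every call to a non-streaming callee is bracketed by a conditional SMSTOP/SMSTART keyed off the PSTATE.SM value returned by __arm_sme_state. When two adjacent call sites are guarded by the same saved value, the SMSTART after the first call and the SMSTOP before the second cancel out, which is the pair the peephole pass is expected to remove in the test above.

    void normal_callee(void);  // non-streaming callee, as in the tests

    void streaming_compatible_caller(void) __arm_streaming_compatible {
      normal_callee();  // conditional SMSTOP before / SMSTART after the call
      normal_callee();  // the SMSTART/SMSTOP between the two calls is guarded
                        // by the same PSTATE.SM value, so it is redundant
    }
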
+--- +# No smstart/smstop pairs should be removed by aarch64-sme-peephole-opt as the register masks do not match + +name: streaming_compatible_smstartstop_regmask +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_regmask + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, [[COPY]], 0, csr_aarch64_aapcs, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, [[COPY]], 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %0:gpr64 = COPY $x0 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, %0, 0, csr_aarch64_aapcs, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, %0, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
+--- +# No smstart/smstop pairs should be removed by aarch64-sme-peephole-opt as the value of pstate.sm is held in a physical register + +name: streaming_compatible_smstartstop_physical_regs +alignment: 4 +tracksRegLiveness: true +liveins: [] +body: | + bb.0 (%ir-block.0): + ; CHECK-LABEL: name: streaming_compatible_smstartstop_physical_regs + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + ; CHECK-NEXT: BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ; CHECK-NEXT: RET_ReallyLR + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL &__arm_sme_state, csr_aarch64_sme_abi_support_routines_preservemost_from_x2, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 0, undef $x7, 0, csr_aarch64_smstartstop, implicit-def $sp + BL @normal_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + MSRpstatePseudo 1, 1, undef $x7, 0, csr_aarch64_smstartstop + RET_ReallyLR +... 
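
The next file in the patch, sme-pstate-sm-changing-call-disable-coalescing.ll, checks that values passed to or returned from a streaming-mode-changing call are not coalesced with SVE Z registers, so that the call sequence does not need scalable ('mul vl') loads and stores for them. A hand-written C analogue of one of those tests (a sketch only, loosely mirroring dont_coalesce_arg_f32 below; it assumes the ACLE SVE intrinsics from arm_sve.h and an SME-enabled compiler, and builds the vector with a dup rather than an insert into lane 0):

    #include <arm_sve.h>

    void use_f32(float);  // non-streaming callee, as in the tests below

    void streaming_caller_f32(float arg, float *ptr) __arm_streaming {
      // 'vec' is live in a Z register across the mode-changing call; 'arg'
      // should stay in a scalar register instead of being coalesced into
      // the Z register that holds 'vec'.
      svfloat32_t vec = svdup_f32(arg);
      use_f32(arg);                        // smstop sm / bl use_f32 / smstart sm
      svst1_f32(svptrue_b32(), ptr, vec);  // store after re-entering streaming mode
    }
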
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -0,0 +1,1559 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-unknown-eabi-elf" + +; This test verifies that call arguments and results are not coalesced +; with SVE vector registers by the coalescer, such that no 'mul vl' +; ldr/str pairs are generated in the streaming-mode-changing call +; sequence. + +; +; Scalar arguments +; + +define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: bl use_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i8 %arg, i32 0 + call void @use_i8(i8 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: bl use_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i16 %arg, i32 0 + call void @use_i16(i16 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: bl use_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i32 %arg, i32 0 + call void @use_i32(i32 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl use_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, i64 %arg, i32 0 + call void @use_i64(i64 %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, half %arg, i32 0 + call void @use_f16(half %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, float %arg, i32 0 + call void @use_f32(float %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = insertelement poison, double %arg, i32 0 + call void @use_f64(double %arg) + store %vec, ptr %ptr + ret void +} + + +; +; Single-element vector arguments +; + +define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i8> %arg, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + call void @use_v16i8(<1 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i16> %arg, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + call void @use_v8i16(<1 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i32> %arg, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + call void @use_v4i32(<1 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x i64> %arg, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + call void @use_v2i64(<1 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x half> %arg, i32 0 + %vec = insertelement poison, half %elt, i32 0 + call void @use_v8f16(<1 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x float> %arg, i32 0 + %vec = insertelement poison, float %elt, i32 0 + call void @use_v4f32(<1 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %elt = extractelement <1 x double> %arg, i32 0 + %vec = insertelement poison, double %elt, i32 0 + call void @use_v2f64(<1 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Full vector arguments +; + +define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v16i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %arg, i64 0) + call void @use_v16i8(<16 x i8> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %arg, i64 0) + call void @use_v8i16(<8 x i16> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %arg, i64 0) + call void @use_v4i32(<4 x i32> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %arg, i64 0) + call void @use_v2i64(<2 x i64> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v8f16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %arg, i64 0) + call void @use_v8f16(<8 x half> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v4f32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv4f32.v4f32( poison, <4 x float> %arg, i64 0) + call void @use_v4f32(<4 x float> %arg) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_arg_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl use_v2f64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %vec = call @llvm.vector.insert.nxv2f64.v2f64( poison, <2 x double> %arg, i64 0) + call void @use_v2f64(<2 x double> %arg) + store %vec, ptr %ptr + ret void +} + +; +; Scalar return values +; + +define void @dont_coalesce_res_i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i8 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i8 @get_i8() + %vec = insertelement poison, i8 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i16 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i16 @get_i16() + %vec = insertelement poison, i16 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i32 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i32 @get_i32() + %vec = insertelement poison, i32 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_i64 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @get_i64() + %vec = insertelement poison, i64 %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f16 +; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call half @get_f16() + %vec = insertelement poison, half %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f32 +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call float @get_f32() + %vec = insertelement poison, float %res, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_f64(ptr %ptr) #0 { +; CHECK-LABEL: 
dont_coalesce_res_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call double @get_f64() + %vec = insertelement poison, double %res, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Single-element vector result values +; + +define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i8 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i8> @get_v1i8() + %elt = extractelement <1 x i8> %res, i32 0 + %vec = insertelement poison, i8 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i16 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded 
Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i16> @get_v1i16() + %elt = extractelement <1 x i16> %res, i32 0 + %vec = insertelement poison, i16 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i32 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i32> @get_v1i32() + %elt = extractelement <1 x i32> %res, i32 0 + %vec = insertelement poison, i32 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1i64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x i64> @get_v1i64() + %elt = extractelement <1 x i64> %res, i32 0 + %vec = insertelement poison, i64 %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f16 +; 
CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x half> @get_v1f16() + %elt = extractelement <1 x half> %res, i32 0 + %vec = insertelement poison, half %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f32 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x float> @get_v1f32() + %elt = extractelement <1 x float> %res, i32 0 + %vec = insertelement poison, float %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v1f64 +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <1 x double> @get_v1f64() + %elt = extractelement <1 x double> %res, i32 0 + %vec = insertelement poison, double %elt, i32 0 + store %vec, ptr %ptr + ret void +} + +; +; Full vector result values +; + +define void @dont_coalesce_res_v16i8(ptr %ptr) #0 
{ +; CHECK-LABEL: dont_coalesce_res_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v16i8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <16 x i8> @get_v16i8() + %vec = call @llvm.vector.insert.nxv16i8.v16i8( poison, <16 x i8> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8i16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x i16> @get_v8i16() + %vec = call @llvm.vector.insert.nxv8i16.v8i16( poison, <8 x i16> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4i32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; 
CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x i32> @get_v4i32() + %vec = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2i64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @get_v2i64() + %vec = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8f16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x half> @get_v8f16() + %vec = call @llvm.vector.insert.nxv8f16.v8f16( poison, <8 x half> %res, i64 0) + store %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4f32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; 
CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x float> @get_v4f32() + %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0) + store <vscale x 4 x float> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2f64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x double> @get_v2f64() + %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0) + store <vscale x 2 x double> %vec, ptr %ptr + ret void +} + +declare half @get_f16() +declare float @get_f32() +declare double @get_f64() +declare <1 x half> @get_v1f16() +declare <1 x float> @get_v1f32() +declare <1 x double> @get_v1f64() +declare <8 x half> @get_v8f16() +declare <4 x float> @get_v4f32() +declare <2 x double> @get_v2f64() + +declare i8 @get_i8() +declare i16 @get_i16() +declare i32 @get_i32() +declare i64 @get_i64() +declare <1 x i8> @get_v1i8() +declare <1 x i16> @get_v1i16() +declare <1 x i32> @get_v1i32() +declare <1 x i64> @get_v1i64() +declare <16 x i8> @get_v16i8() +declare <8 x i16> @get_v8i16() +declare <4 x i32> @get_v4i32() +declare <2 x i64> @get_v2i64() + +declare void @use_f16(half) +declare void @use_f32(float) +declare void @use_f64(double) +declare void @use_v1f16(<1 x half>) +declare void @use_v1f32(<1 x float>) +declare void @use_v1f64(<1 x double>) +declare void @use_v8f16(<8 x half>) +declare void @use_v4f32(<4 x float>) +declare void @use_v2f64(<2 x double>) + +declare void @use_i8(i8) +declare void @use_i16(i16) +declare void @use_i32(i32) +declare void @use_i64(i64) +declare void @use_v1i8(<1 x i8>) +declare void @use_v1i16(<1 x i16>) +declare void @use_v1i32(<1 x i32>) +declare void @use_v1i64(<1 x i64>) +declare void @use_v16i8(<16 x i8>) +declare void @use_v8i16(<8 x i16>) +declare void @use_v4i32(<4 x i32>) +declare void @use_v2i64(<2 x i64>) + +declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64) +declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64) +declare 
@llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.vector.insert.nxv8f16.v8f16(, <8 x half>, i64) +declare @llvm.vector.insert.nxv4f32.v4f32(, <4 x float>, i64) +declare @llvm.vector.insert.nxv2f64.v2f64(, <2 x double>, i64) + +attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" } diff --git a/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll b/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll --- a/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll +++ b/llvm/test/CodeGen/AArch64/sme-read-write-tpidr2.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sme -verify-machineinstrs < %s | FileCheck %s define i64 @get_tpidr2_el0() { ; CHECK-LABEL: get_tpidr2_el0: diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-abi.ll @@ -0,0 +1,225 @@ +; RUN: opt -S -mtriple=aarch64-linux-gnu -opaque-pointers -aarch64-sme-abi %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -opaque-pointers -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s +; Check that the test also passes when not in opaque-pointer mode. +; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s -o /dev/null + +; XFAIL: * + +declare void @private_za_callee(); +declare void @shared_za_callee() "aarch64_pstate_za_shared" +declare void @new_za_callee() "aarch64_pstate_za_new" +declare void @preserved_za_callee() "aarch64_pstate_za_preserved" +declare i64 @foo(i64); + +declare float @llvm.cos.f32(float) + +; CHECK: type { ptr, i16, [6 x i8] } + +; Shared ZA Caller, Private ZA Callee + +define void @shared_za_caller() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller() #4 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @private_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @private_za_callee() + ret void +} + +; Shared ZA Caller, Private ZA Callees + +define i64 @shared_za_caller_multiple_callees(i64 %a) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_multiple_callees(i64 %a) #4 { +; 
CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: %b = call i64 @foo(i64 %a) +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: %sum = add i64 %a, %b +; CHECK-NEXT: %live3 = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc4 = trunc i64 %live3 to i16 +; CHECK-NEXT: %tpidr2.obj.live5 = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc4, ptr %tpidr2.obj.live5, align 2 +; CHECK-NEXT: %tpi.int6 = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int6) +; CHECK-NEXT: %c = call i64 @foo(i64 %sum) +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr27 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp8 = icmp eq i64 %tpidr27, 0 +; CHECK-NEXT: br i1 %cmp8, label %restore.za2, label %resume1 +; CHECK: restore.za2: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume1 +; CHECK: resume1: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: %res = mul i64 %sum, %c +; CHECK-NEXT: ret i64 %res +; + %b = call i64 @foo(i64 %a) + %sum = add i64 %a, %b + %c = call i64 @foo(i64 %sum) + %res = mul i64 %sum, %c + ret i64 %res +} + +; Shared ZA Caller, New ZA Callee + +define void @shared_za_new_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_new_za_callee() #4 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 
@llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @new_za_callee() + ret void +} + +define void @shared_za_streaming_compatible_caller_private_za_callee() "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_streaming_compatible_caller_private_za_callee() #5 { +; CHECK-NEXT: %tpidr2.call.obj = alloca %tpidr2_ty, align 8 +; CHECK-NEXT: %N = call i64 @llvm.aarch64.sme.cntsb() +; CHECK-NEXT: %NN = mul i64 %N, %N +; CHECK-NEXT: %buffer = alloca i8, i64 %NN, align 16 +; CHECK-NEXT: %tpidr2.call.obj.buffer = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 0 +; CHECK-NEXT: store ptr %buffer, ptr %tpidr2.call.obj.buffer, align 8 +; CHECK-NEXT: %live = call i64 @llvm.aarch64.sme.get.live.za.slices() +; CHECK-NEXT: %live.trunc = trunc i64 %live to i16 +; CHECK-NEXT: %tpidr2.obj.live = getelementptr %tpidr2_ty, ptr %tpidr2.call.obj, i64 0, i32 1 +; CHECK-NEXT: store i16 %live.trunc, ptr %tpidr2.obj.live, align 2 +; CHECK-NEXT: %tpi.int = ptrtoint ptr %tpidr2.call.obj to i64 +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 %tpi.int) +; CHECK-NEXT: call void @private_za_callee() +; CHECK-NEXT: call void @llvm.aarch64.sme.start.pstateza() +; CHECK-NEXT: %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2() +; CHECK-NEXT: %cmp = icmp eq i64 %tpidr2, 0 +; CHECK-NEXT: br i1 %cmp, label %restore.za, label %resume +; CHECK: restore.za: +; CHECK-NEXT: call void @llvm.aarch64.sme.tpidr2.restore(ptr %tpidr2.call.obj) +; CHECK-NEXT: br label %resume +; CHECK: resume: +; CHECK-NEXT: call void @llvm.aarch64.sme.set.tpidr2(i64 0) +; CHECK-NEXT: ret void +; + call void @private_za_callee() + ret void +} + +; Shared ZA Caller, Shared ZA Callee (Lazy-save not required) + +define void @shared_za_shared_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_shared_za_callee() #0 { +; CHECK-NEXT: call void @shared_za_callee() +; CHECK-NEXT: ret void +; + call void @shared_za_callee() + ret void +} + +; Ensure we also check the attribute on the call itself (not just from the called function) +define void @shared_za_shared_za_callee_from_ptr(ptr %fnptr) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_shared_za_callee_from_ptr(ptr %fnptr) #0 { +; CHECK-NEXT: call void %fnptr() #0 +; CHECK-NEXT: ret void +; + call void %fnptr() "aarch64_pstate_za_shared" + ret void +} + +; Shared ZA Caller, Preserved ZA Callee (Lazy-save not required) + +define void @shared_za_caller_preserved_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_preserved_za_callee() #0 { +; CHECK-NEXT: call void @preserved_za_callee() +; CHECK-NEXT: ret void +; + call void @preserved_za_callee() + ret void +} + +; Shared ZA Caller with Intrinsic Call (Lazy-save not required) + +define float @shared_za_caller_with_intrinsic(ptr %a) "aarch64_pstate_za_shared" { +; CHECK-LABEL: define {{[^@]+}}@shared_za_caller_with_intrinsic(ptr %a) #0 { +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[A:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[A]]) +; CHECK-NEXT: ret float 
[[RES]] + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %a) + %res = load float, ptr %a + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %a) + ret float %res +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +;. +; CHECK: attributes #0 = { "aarch64_pstate_za_shared" } +; CHECK: attributes #1 = { "aarch64_pstate_za_new" } +; CHECK: attributes #2 = { "aarch64_pstate_za_preserved" } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn } +; CHECK: attributes #4 = { "aarch64_expanded_pstate_za" "aarch64_pstate_za_shared" } +; CHECK: attributes #5 = { "aarch64_expanded_pstate_za" "aarch64_pstate_sm_compatible" "aarch64_pstate_za_shared" } +; CHECK: attributes #6 = { argmemonly nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #7 = { nocallback nofree nosync nounwind readnone willreturn } +; CHECK: attributes #8 = { nocallback nofree nosync nounwind willreturn } +; CHECK: attributes #9 = { inaccessiblememonly nocallback nofree nosync nounwind readonly willreturn } +;. diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -10,14 +10,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -42,14 +43,15 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; @@ -237,25 +237,31 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: 
ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: %0 = call fast double @llvm.cos.f64(double %x) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,30 +129,37 @@ ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz x19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl normal_callee_vec_arg -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz x19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> S (Normal -> Streaming) @@ -313,20 +313,19 @@ ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: 
ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload -; CHECK-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll copy from llvm/test/CodeGen/AArch64/sme-streaming-body.ll copy to llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-locally-interface.ll @@ -3,7 +3,8 @@ declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; -declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +; Streaming Caller, Locally Streaming Callee define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_streaming_callee: @@ -14,8 +15,8 @@ ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: bl streaming_compatible_callee -; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -24,21 +25,6 @@ ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @streaming_compatible_callee(); - call void @streaming_compatible_callee(); - ret void; -} - -; Test that a streaming body and streaming interface, no smstart/smstop are emitted, -; because the function already is in streaming mode upon entry. -define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { -; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret call void @streaming_callee(); call void @streaming_callee(); ret void; @@ -53,7 +39,7 @@ ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: cmp x0, #1 -; CHECK-NEXT: b.ne .LBB2_2 +; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: // %bb.1: // %if.else ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload @@ -61,7 +47,7 @@ ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_2: // %if.end +; CHECK-NEXT: .LBB1_2: // %if.end ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -94,9 +80,9 @@ ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: add v0.2d, v1.2d, v0.2d ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm @@ -112,27 +98,39 @@ ret <2 x i64> %add; } -; Test that we use the interface (not the function's body) to determine what -; streaming-mode to enter the callee. In this case the interface is normal, so -; pstate.sm must be 0 on entry and is 0 upon return from the callee. +define void @locally_streaming_caller_normal_callee() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_normal_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + + call void @normal_callee(); + ret void; +} + +define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee(); + call void @streaming_callee(); + ret void; +} + +; Locally Streaming Caller, Locally Streaming Callee + define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @locally_streaming_caller_streaming_callee(); @@ -175,32 +173,33 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible" -define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { +define <2 x i64> @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret -; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible" - ret {<2 x i64>, <2 x i64>} %res; + %v1.res = extractvalue {<2 x i64>, <2 x i64>} %res, 1 + ret <2 x i64> %v1.res; } declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible" @@ -237,72 +236,35 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind 
"aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: ret -entry: - %0 = call fast double @llvm.cos.f64(double %x) - ret double %0 -} - -declare double @llvm.cos.f64(double) - - -define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" { -; CHECK-LABEL: test_arg_survives_loop: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB9_1: // %for.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subs w0, w0, #1 -; CHECK-NEXT: b.ne .LBB9_1 -; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: fmov s0, #1.00000000 -; CHECK-NEXT: ldr s1, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: fadd s0, s1, s0 -; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: bl cos +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: - br label %for.body - -for.body: - %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %inc = add nuw nsw i32 %i.02, 1 - %exitcond.not = icmp eq i32 %inc, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - %add = fadd float %arg, 1.000000e+00 - ret float %add - + %0 = call fast double @llvm.cos.f64(double %x) + ret 
double %0 } + +declare double @llvm.cos.f64(double) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +; This function would normally scavenge a stackslot from the callee-save +; area, which would lead to spilling 's0' to that stackslot before the +; smstop and filling it with 'addvl + offset' after the smstop because +; the frame-pointer is not available. +; This would not be valid, since the vector-length has changed so 'addvl' +; cannot be used. This is testing that the stackslot-scavenging is disabled +; when there are any streaming-mode-changing call-sequences in the +; function. +define void @test_no_stackslot_scavenging(float %f) #0 { +; CHECK-LABEL: test_no_stackslot_scavenging: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f +; CHECK-NEXT: smstart sm +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ptr = alloca <vscale x 16 x i8> + call void asm sideeffect "", "~{x24}"() nounwind + call void @use_f(float %f) + ret void +} + +declare void @use_f(float) + +attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming.ll b/llvm/test/CodeGen/AArch64/sme-streaming.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s + +; Streaming mode functions can use the full SME instruction set and the subset +; of SVE and NEON instructions that are legal in streaming mode. 
+; + +; CHECK: sclamp z0.b, z0.b, z0.b +define <vscale x 16 x i8> @streaming_compatible_sme(<vscale x 16 x i8> %x) { + %1 = call <vscale x 16 x i8> asm "sclamp $0.b, $0.b, $0.b", "=w,0"(<vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %1 +} + +; CHECK: add z0.b, z0.b, z0.b +define <vscale x 16 x i8> @streaming_compatible_sve(<vscale x 16 x i8> %x) { + %1 = call <vscale x 16 x i8> asm "add $0.b, $0.b, $0.b", "=w,0"(<vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %1 +} + +; CHECK: fmulx s0, s0, s0 +define float @streaming_compatible_neon(float %x) { + %1 = call float asm "fmulx ${0:s}, ${0:s}, ${0:s}", "=w,0"(float %x) + ret float %1 +} diff --git a/llvm/test/CodeGen/AArch64/sme-tailcall.ll b/llvm/test/CodeGen/AArch64/sme-tailcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-tailcall.ll @@ -0,0 +1,194 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + + +; Caller is non-streaming mode + +define void @non_streaming_caller_to_streaming_callee() nounwind { +; CHECK-LABEL: non_streaming_caller_to_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK: smstart sm +; CHECK: bl streaming_callee +; CHECK: smstop sm +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @non_streaming_caller_to_streaming_compatible_callee() nounwind { +; CHECK-LABEL: non_streaming_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +; Caller is streaming mode + +define void @streaming_caller_to_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK-NEXT: b streaming_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @streaming_caller_to_non_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_non_streaming_callee: +; CHECK: // %bb.0: // %entry +; CHECK: smstop sm +; CHECK: bl normal_callee +; CHECK: smstart sm +entry: + tail call void @normal_callee() + ret void +} + +define void @streaming_caller_to_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +; Caller is streaming compatible mode + +define void @streaming_compatible_caller_to_streaming_callee() nounwind "aarch64_pstate_sm_compatible"{ +; CHECK-LABEL: streaming_compatible_caller_to_streaming_callee: +; CHECK: // %bb.1: // %entry +; CHECK: smstart sm +; CHECK: bl streaming_callee +; CHECK: smstop sm +entry: + tail call void @streaming_callee() "aarch64_pstate_sm_enabled" + ret void +} + +define void @streaming_compatible_caller_to_non_streaming_callee() nounwind "aarch64_pstate_sm_compatible"{ +; CHECK-LABEL: streaming_compatible_caller_to_non_streaming_callee: +; CHECK: // %bb.1: // %entry +; CHECK: smstop sm +; CHECK: bl normal_callee +; CHECK: smstart sm +entry: + tail call void @normal_callee() + ret void +} + +define void 
@streaming_compatible_caller_to_streaming_compatible_callee() nounwind "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: streaming_compatible_caller_to_streaming_compatible_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b streaming_compatible_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + ret void +} + +declare void @za_new_callee() "aarch64_pstate_za_new"; +declare void @za_shared_callee() "aarch64_pstate_za_shared"; +declare void @za_preserved_callee() "aarch64_pstate_za_preserved"; + +; Caller with ZA state new + +define void @za_new_caller_to_za_new_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_new_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_new_callee +; CHECK: smstart za +; CHECK: bl __arm_tpidr2_restore +; CHECK: smstop za +entry: + tail call void @za_new_callee() "aarch64_pstate_za_new"; + ret void; +} + + +define void @za_new_caller_to_za_shared_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_shared_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_shared_callee +; CHECK: smstop za +entry: + tail call void @za_shared_callee() "aarch64_pstate_za_shared"; + ret void; +} + +define void @za_new_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_new" { +; CHECK-LABEL: za_new_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %prelude +; CHECK: bl __arm_tpidr2_save +; CHECK: smstart za +; CHECK: bl za_preserved_callee +; CHECK: smstop za +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} + + +; Caller with ZA state shared + +define void @za_shared_caller_to_za_new_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_new_callee: +; CHECK: // %bb.0: // %entry +; CHECK: bl za_new_callee +; CHECK-NEXT: smstart za +; CHECK: bl __arm_tpidr2_restore +entry: + tail call void @za_new_callee() "aarch64_pstate_za_new"; + ret void; +} + +define void @za_shared_caller_to_za_shared_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_shared_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b za_shared_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_shared_callee() "aarch64_pstate_za_shared"; + ret void; +} + +define void @za_shared_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_shared" { +; CHECK-LABEL: za_shared_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK: b za_preserved_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} + +; Caller with ZA state preserved + +define void @za_preserved_caller_to_za_preserved_callee() nounwind "aarch64_pstate_za_preserved" { +; CHECK-LABEL: za_preserved_caller_to_za_preserved_callee: +; CHECK: // %bb.0: // %entry +; CHECK-NOT: {{smstart|smstop}} +; CHECK-NEXT: b za_preserved_callee +; CHECK-NOT: {{smstart|smstop}} +entry: + tail call void @za_preserved_callee() "aarch64_pstate_za_preserved"; + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll --- a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll +++ b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll @@ -1,5 +1,5 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s define void @toggle_pstate_za() { ; CHECK-LABEL: toggle_pstate_za: diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll b/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-zt0-preserve.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=SME +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=SME2 + +; Normal callee, no ZA state +declare void @normal_callee(); + +; Callees with ZA state +declare void @za_shared_callee() "aarch64_pstate_za_shared"; +declare void @za_new_callee() "aarch64_pstate_za_new"; + +; Callee with preserved ZA state +declare void @za_preserved_callee() "aarch64_pstate_za_preserved"; + + +define void @za_new_caller_normal_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_normal_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB0_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB0_2: +; SME-NEXT: smstart za +; SME-NEXT: sub x8, x29, #16 +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x8 +; SME-NEXT: bl normal_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB0_4 +; SME-NEXT: // %bb.3: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB0_4: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_normal_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #80 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB0_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB0_2: +; SME2-NEXT: smstart za +; SME2-NEXT: sub x8, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x8 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl normal_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB0_4 +; SME2-NEXT: // %bb.3: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB0_4: +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + ret void; +} + +define void @za_new_caller_za_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_za_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB1_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB1_2: +; SME-NEXT: smstart za +; SME-NEXT: sub x8, x29, #16 +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x8 +; SME-NEXT: bl za_new_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB1_4 +; SME-NEXT: // %bb.3: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB1_4: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: bl za_shared_callee +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_za_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #144 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB1_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB1_2: +; SME2-NEXT: smstart za +; SME2-NEXT: sub x8, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x8 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB1_4 +; SME2-NEXT: // %bb.3: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB1_4: +; SME2-NEXT: sub x19, x29, #144 +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_shared_caller_normal_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_normal_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: sub x10, x29, #16 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x10 +; SME-NEXT: bl normal_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB2_2 +; SME-NEXT: // %bb.1: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB2_2: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_normal_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #80 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: sub x10, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x10 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl normal_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB2_2 +; SME2-NEXT: // %bb.1: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB2_2: +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + ret void; +} + +define void @za_shared_caller_za_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_za_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: sub x10, x29, #16 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: sturh w9, [x29, #-8] +; SME-NEXT: msr TPIDR2_EL0, x10 +; SME-NEXT: bl za_new_callee +; SME-NEXT: smstart za +; SME-NEXT: sub x0, x29, #16 +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbnz x8, .LBB3_2 +; SME-NEXT: // %bb.1: +; SME-NEXT: bl __arm_tpidr2_restore +; SME-NEXT: .LBB3_2: +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: bl za_shared_callee +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_za_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; SME2-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #144 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: sub x10, x29, #16 +; SME2-NEXT: sub x19, x29, #80 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: sturh w9, [x29, #-8] +; SME2-NEXT: msr TPIDR2_EL0, x10 +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: smstart za +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: sub x0, x29, #16 +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbnz x8, .LBB3_2 +; SME2-NEXT: // %bb.1: +; SME2-NEXT: bl __arm_tpidr2_restore +; SME2-NEXT: .LBB3_2: +; SME2-NEXT: sub x19, x29, #144 +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: str zt0, [x19] +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr zt0, [x19] +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_new_caller_za_preserved_callee() "aarch64_pstate_za_new" nounwind { +; SME-LABEL: za_new_caller_za_preserved_callee: +; SME: // %bb.0: // %prelude +; SME-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: mrs x8, TPIDR2_EL0 +; SME-NEXT: cbz x8, .LBB4_2 +; SME-NEXT: // %bb.1: // %save.za +; SME-NEXT: bl __arm_tpidr2_save +; SME-NEXT: msr TPIDR2_EL0, xzr +; SME-NEXT: .LBB4_2: +; SME-NEXT: smstart za +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: smstop za +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_new_caller_za_preserved_callee: +; SME2: // %bb.0: // %prelude +; SME2-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #16 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: mrs x8, TPIDR2_EL0 +; SME2-NEXT: cbz x8, .LBB4_2 +; SME2-NEXT: // %bb.1: // %save.za +; SME2-NEXT: bl __arm_tpidr2_save +; SME2-NEXT: msr TPIDR2_EL0, xzr +; SME2-NEXT: .LBB4_2: +; SME2-NEXT: smstart za +; SME2-NEXT: zero { zt0 } +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: smstop za +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} + +define void @za_shared_caller_za_preserved_callee() "aarch64_pstate_za_shared" nounwind { +; SME-LABEL: za_shared_caller_za_preserved_callee: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_shared_caller_za_preserved_callee: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #16 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} + +define void @za_preserved_caller_za_callee() "aarch64_pstate_za_preserved" nounwind { +; SME-LABEL: za_preserved_caller_za_callee: +; SME: // %bb.0: +; SME-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME-NEXT: bl normal_callee +; SME-NEXT: bl za_new_callee +; SME-NEXT: bl za_shared_callee +; SME-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_preserved_caller_za_callee: +; SME2: // %bb.0: +; SME2-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; SME2-NEXT: bl normal_callee +; SME2-NEXT: bl za_new_callee +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret + call void @normal_callee(); + call void @za_new_callee(); + call void @za_shared_callee(); + ret void; +} + +define void @za_preserved_caller_za_preserved_callee() "aarch64_pstate_za_preserved" nounwind { +; SME-LABEL: za_preserved_caller_za_preserved_callee: +; SME: // %bb.0: +; SME-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME-NEXT: bl za_preserved_callee +; SME-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: za_preserved_caller_za_preserved_callee: +; SME2: // %bb.0: +; SME2-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SME2-NEXT: bl za_preserved_callee +; SME2-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SME2-NEXT: ret + call void @za_preserved_callee(); + ret void; +} + +define i32 @spill_fill_zt_load_start_chain(ptr %ptr) "aarch64_pstate_za_shared" { +; SME-LABEL: spill_fill_zt_load_start_chain: +; SME: // %bb.0: +; SME-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; SME-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; SME-NEXT: mov x29, sp +; SME-NEXT: sub sp, sp, #16 +; SME-NEXT: .cfi_def_cfa w29, 32 +; SME-NEXT: .cfi_offset w19, -16 +; SME-NEXT: .cfi_offset w30, -24 +; SME-NEXT: .cfi_offset w29, -32 +; SME-NEXT: mov x8, sp +; SME-NEXT: rdsvl x9, #1 +; SME-NEXT: msub x8, x9, x9, x8 +; SME-NEXT: mov sp, x8 +; SME-NEXT: stur wzr, [x29, #-4] +; SME-NEXT: sturh wzr, [x29, #-6] +; SME-NEXT: stur x8, [x29, #-16] +; SME-NEXT: ldr w19, [x0] +; SME-NEXT: bl za_shared_callee +; SME-NEXT: mov w0, w19 +; SME-NEXT: mov sp, x29 +; SME-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; SME-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME-NEXT: ret +; +; SME2-LABEL: spill_fill_zt_load_start_chain: +; SME2: // %bb.0: +; SME2-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; SME2-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SME2-NEXT: mov x29, sp +; SME2-NEXT: sub sp, sp, #80 +; SME2-NEXT: .cfi_def_cfa w29, 32 +; SME2-NEXT: .cfi_offset w19, -8 +; SME2-NEXT: .cfi_offset w20, -16 +; SME2-NEXT: .cfi_offset w30, -24 +; SME2-NEXT: .cfi_offset w29, -32 +; SME2-NEXT: mov x8, sp +; SME2-NEXT: rdsvl x9, #1 +; SME2-NEXT: msub x8, x9, x9, x8 +; SME2-NEXT: mov sp, x8 +; SME2-NEXT: stur wzr, [x29, #-4] +; SME2-NEXT: sub x20, x29, #80 +; SME2-NEXT: sturh wzr, [x29, #-6] +; SME2-NEXT: stur x8, [x29, #-16] +; SME2-NEXT: ldr w19, [x0] +; SME2-NEXT: str zt0, [x20] +; SME2-NEXT: bl za_shared_callee +; SME2-NEXT: ldr zt0, [x20] +; SME2-NEXT: mov w0, w19 +; SME2-NEXT: mov sp, x29 +; SME2-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; SME2-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; SME2-NEXT: ret + %loadval = load i32, ptr %ptr + call void @za_shared_callee() + ret i32 %loadval +} diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll @@ -90,9 +90,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7h.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 7, i32 %slice.0) + ret { , } %res2 } define { , } @za_read_horiz_vg2_f64(i32 %slice) { @@ -100,9 +103,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7h.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 7, i32 %slice.0) + ret { , } %res2 } ; Vertical @@ -190,9 +196,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7v.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 7, i32 %slice.0) + ret { , } %res2 } define { , } @za_read_vert_vg2_f64(i32 %slice) { @@ -200,9 +209,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: mov { z0.d, z1.d }, za7v.d[w12, 0:1] ; CHECK-NEXT: ret %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice) - ret { , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 7, i32 %slice.0) + ret { , } %res2 } ; @@ -268,9 +280,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3h.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_f32(i32 %slice) { @@ -278,9 +293,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { 
z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3h.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_d(i32 %slice) { @@ -288,9 +306,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7h.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 7, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_horiz_vg4_f64(i32 %slice) { @@ -298,9 +319,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7h.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 7, i32 %slice.0) + ret { , , , } %res2 } ; Vertical @@ -362,9 +386,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3v.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_f32(i32 %slice) { @@ -372,9 +399,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: mov { z0.s - z3.s }, za3v.s[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 3, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_d(i32 %slice) { @@ -382,9 +412,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7v.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 7, i32 %slice.0) + ret { , , , } %res2 } define { , , , } @za_read_vert_vg4_f64(i32 %slice) { @@ -392,9 +425,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: mov { z0.d - z3.d }, za7v.d[w12, 0:3] ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice) - ret { , , , } %res + %slice.0 = add i32 %slice, 0 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 7, i32 %slice.0) + ret { , , , } %res2 } ; Move Multi-Vector From ZA (Read) x2 diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, } @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.b, z1.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + ret {, } %res +} + +define {, } @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, %x, i32 7) + ret {, } %res +} + +define {, } @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, %x, i32 7) + ret {, }%res +} + +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32, , i32) +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, , , } @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.b - z3.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, %x, i32 0) + ret {, , , } %res +} + +define {, , , } @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.h - z3.h }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, %x, i32 3) + ret {, , , } %res +} + +define {, , , } @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 { z0.s - z3.s }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, %x, i32 3) + ret {, , , }%res +} + +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define @luti2_i8( %x) { +; CHECK-LABEL: luti2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.b, zt0, z0[0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, %x, i32 0) + ret %res +} + +define @luti2_i16( %x) { +; CHECK-LABEL: luti2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, zt0, z0[15] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, %x, i32 15) + ret %res +} + +define @luti2_i32( %x) { +; CHECK-LABEL: luti2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.s, 
zt0, z0[15] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, %x, i32 15) + ret %res +} + +declare @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32, , i32) +declare @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32, , i32) +declare @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, } @luti4_i8( %x) { +; CHECK-LABEL: luti4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.b, z1.b }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + ret {, } %res +} + +define {, } @luti4_i16( %x) { +; CHECK-LABEL: luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, %x, i32 3) + ret {, } %res +} + +define {, } @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, %x, i32 3) + ret {, } %res +} + +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32, , i32) +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define {, , , } @luti4_i16( %x) { +; CHECK-LABEL: luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, %x, i32 0) + ret {, , , } %res +} + +define {, , , } @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, %x, i32 1) + ret {, , , } %res +} + +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +; lookup table expand one register + +define @luti4_i8( %x) { +; CHECK-LABEL: luti4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.b, zt0, z0[0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, %x, i32 0) + ret %res +} + +define @luti4_i16( %x) { +; CHECK-LABEL: 
luti4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, zt0, z0[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, %x, i32 7) + ret %res +} + +define @luti4_i32( %x) { +; CHECK-LABEL: luti4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.s, zt0, z0[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, %x, i32 7) + ret %res +} + +declare @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32, , i32) +declare @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32, , i32) +declare @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +define void @zero_zt0() { +; CHECK-LABEL: zero_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: zero { zt0 } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.zero.zt(i32 0) + ret void +} + +declare void @llvm.aarch64.sme.zero.zt(i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; LDR + +define void @ldr_zt0(ptr %ptr) { +; CHECK-LABEL: ldr_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr zt0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptr) + ret void; +} + +; STR + +define void @str_zt0(ptr %ptr) { +; CHECK-LABEL: str_zt0: +; CHECK: // %bb.0: +; CHECK-NEXT: str zt0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.str.zt(i32 0, ptr %ptr) + ret void; +} + +declare void @llvm.aarch64.sme.ldr.zt(i32, ptr) +declare void @llvm.aarch64.sme.str.zt(i32, ptr) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti2-lane-x2.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti2-lane-x2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti2-lane-x2.ll @@ -0,0 +1,510 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS + +; lookup table expand one register +define @luti2_i8_z0_z7( %x) { +; STRIDED-LABEL: luti2_i8_z0_z7: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti2 { z0.b, z8.b }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti2_i8_z0_z7: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 56 * VG +; CONTIGUOUS-NEXT: luti2 { z0.b, z1.b }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @luti2_i8_z0_z4_z8_z12( %x) { +; STRIDED-LABEL: luti2_i8_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 
0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti2 { z0.b, z4.b, z8.b, z12.b }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z3.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti2_i8_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-14 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 
0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: luti2 { z0.b - z3.b }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: mov z3.d, z1.d +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #14 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 16) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 32) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 48) + ret %v3 +} + +define @luti2_i16_z0_z7( %x) { +; STRIDED-LABEL: luti2_i16_z0_z7: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti2 { z0.h, z8.h }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti2_i16_z0_z7: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 56 * VG +; CONTIGUOUS-NEXT: luti2 { z0.h, z1.h }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @luti2_i16_z0_z4_z8_z12( %x) { +; STRIDED-LABEL: luti2_i16_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 
0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti2 { z0.h, z4.h, z8.h, z12.h }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z3.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti2_i16_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-14 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 
0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: luti2 { z0.h - z3.h }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: mov z3.d, z1.d +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #14 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv16i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv16i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv16i16( poison, %res.v0, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv16i16( %v0, %res.v1, i64 24) + ret %v3 +} + +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32, , i32) +declare @llvm.vector.insert.nxv64i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv16i16(, , i64) +declare @llvm.vector.insert.nxv16i16.nxv8i16(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti4-lane.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti4-lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-luti4-lane.ll @@ -0,0 +1,511 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=STRIDED +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS + +; lookup table expand one register +define @luti4_i8_z0_z7( %x) { +; STRIDED-LABEL: luti4_i8_z0_z7: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti4 { z0.b, z8.b }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, 
#10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti4_i8_z0_z7: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 56 * VG +; CONTIGUOUS-NEXT: luti4 { z0.b, z1.b }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded 
Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @luti4_i16_z0_z7( %x) { +; STRIDED-LABEL: luti4_i16_z0_z7: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 
0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti4 { z0.h, z8.h }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti4_i16_z0_z7: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 56 * VG +; CONTIGUOUS-NEXT: luti4 { z0.h, z1.h }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @luti4_i16_z0_z4_z8_z12( %x) { +; STRIDED-LABEL: luti4_i16_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti4 { z0.h, z4.h, z8.h, z12.h }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, 
#1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z3.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti4_i16_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-14 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul 
vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: mov z3.d, z1.d +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #14 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv16i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv16i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv16i16( poison, %res.v0, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv16i16( %v0, %res.v1, i64 24) + ret %v3 +} + +define @luti4_i8_z0_z4_z8_z12( %x) { +; STRIDED-LABEL: luti4_i8_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-16 +; STRIDED-NEXT: str z23, [sp] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; STRIDED-NEXT: .cfi_offset w29, -16 +; STRIDED-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; STRIDED-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; STRIDED-NEXT: luti4 { z0.s, z4.s, z8.s, z12.s }, zt0, z0[0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, 
#15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z3.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #16 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: luti4_i8_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-14 +; CONTIGUOUS-NEXT: str z23, [sp] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CONTIGUOUS-NEXT: .cfi_offset w29, -16 +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 8 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 24 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 32 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG +; CONTIGUOUS-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 48 * VG +; CONTIGUOUS-NEXT: luti4 { z0.s - z3.s }, zt0, z0[0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: mov z3.d, z1.d +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #14 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, %x, i32 0) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i32.nxv16i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i32.nxv16i32( %v0, %res.v1, i64 4) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv16i32.nxv16i32( poison, %res.v0, i64 8) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv16i32.nxv16i32( %v0, %res.v1, i64 12) + ret %v3 +} + +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32, , i32) +declare {, } @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, , i32) +declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, , i32) +declare @llvm.vector.insert.nxv64i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv16i16(, , i64) +declare @llvm.vector.insert.nxv16i16.nxv8i16(, , i64) +declare @llvm.vector.insert.nxv16i32.nxv16i32(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-max.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +; BFMAX (Single, x2) + +define { , } @multi_vec_max_single_x2_bf16( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_max_single_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmax { z0.h, z1.h }, { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmax.single.x2.nxv8bf16( %zdn1, %zdn2, %zm) + ret { , } %res +} + +; BFMAX (Single, x4) + +define { , , , } +@multi_vec_max_single_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_max_single_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmax { z0.h - z3.h }, { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmax.single.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) + ret { , , , } %res +} + +; BFMAX (Multi, x2) + +define { , } @multi_vec_max_multi_x2_bf16( %zdn1, %zdn2, %zm1, %zm2) { +; CHECK-LABEL: 
multi_vec_max_multi_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: bfmax { z0.h, z1.h }, { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmax.x2.nxv8bf16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; BFMAX (Multi, x4) + +define { , , , } +@multi_vec_max_multi_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_max_multi_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: bfmax { z0.h - z3.h }, { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmax.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +declare { , } @llvm.aarch64.sve.bfmax.single.x2.nxv8bf16(, , ) + +declare { , , , } + @llvm.aarch64.sve.bfmax.single.x4.nxv8bf16(, , , , ) + +declare { , } @llvm.aarch64.sve.bfmax.x2.nxv8bf16(, , , ) + +declare { , , , } + @llvm.aarch64.sve.bfmax.x4.nxv8bf16(, , , , , , , ) diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-min.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +; BFMIN (Single, x2) + +define { , } @multi_vec_min_single_x2_bf16( %zdn1, %zdn2, %zm) { +; CHECK-LABEL: multi_vec_min_single_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmin { z0.h, z1.h }, { z0.h, z1.h }, z2.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmin.single.x2.nxv8bf16( %zdn1, %zdn2, %zm) + ret { , } %res +} + +; BFMIN (Single, x4) + +define { , , , } +@multi_vec_min_single_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +; CHECK-LABEL: multi_vec_min_single_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfmin { z0.h - z3.h }, { z0.h - z3.h }, z4.h +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmin.single.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm) + ret { , , , } %res +} + +; BFMIN (Multi, x2) + +define { , } @multi_vec_min_multi_x2_bf16( %zdn1, %zdn2, %zm1, %zm2) { +; 
CHECK-LABEL: multi_vec_min_multi_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3 +; CHECK-NEXT: bfmin { z0.h, z1.h }, { z0.h, z1.h }, { z2.h, z3.h } +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.bfmin.x2.nxv8bf16( %zdn1, %zdn2, %zm1, %zm2) + ret { , } %res +} + +; BFMIN (Multi, x4) + +define { , , , } +@multi_vec_min_multi_x4_bf16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +; CHECK-LABEL: multi_vec_min_multi_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7 +; CHECK-NEXT: bfmin { z0.h - z3.h }, { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } + @llvm.aarch64.sve.bfmin.x4.nxv8bf16( %zdn1, %zdn2, %zdn3, %zdn4, + %zm1, %zm2, %zm3, %zm4) + ret { , , , } %res +} + +declare { , } @llvm.aarch64.sve.bfmin.single.x2.nxv8bf16(, , ) + +declare { , , , } + @llvm.aarch64.sve.bfmin.single.x4.nxv8bf16(, , , , ) + +declare { , } @llvm.aarch64.sve.bfmin.x2.nxv8bf16(, , , ) + +declare { , , , } + @llvm.aarch64.sve.bfmin.x4.nxv8bf16(, , , , , , , ) diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -9,8 +9,10 @@ define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable } attributes #0 = { nounwind "target-features"="+sve" } @@ -131,6 +133,43 @@ RET_ReallyLR ... 
--- +name: spills_fills_stack_id_zpr2strided +tracksRegLiveness: true +registers: + - { id: 0, class: zpr2strided } +stack: +liveins: + - { reg: '$z0_z8', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $z0_z8 + + ; CHECK-LABEL: name: spills_fills_stack_id_zpr2strided + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16 + ; CHECK-NEXT: stack-id: scalable-vector + + ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2strided + ; EXPAND: STR_ZXI $z0, $sp, 0 + ; EXPAND: STR_ZXI $z8, $sp, 1 + ; EXPAND: $z0 = LDR_ZXI $sp, 0 + ; EXPAND: $z8 = LDR_ZXI $sp, 1 + + %0:zpr2strided = COPY $z0_z8 + + $z0_z1_z2_z3 = IMPLICIT_DEF + $z4_z5_z6_z7 = IMPLICIT_DEF + $z8_z9_z10_z11 = IMPLICIT_DEF + $z12_z13_z14_z15 = IMPLICIT_DEF + $z16_z17_z18_z19 = IMPLICIT_DEF + $z20_z21_z22_z23 = IMPLICIT_DEF + $z24_z25_z26_z27 = IMPLICIT_DEF + $z28_z29_z30_z31 = IMPLICIT_DEF + + $z0_z8 = COPY %0 + RET_ReallyLR +... +--- name: spills_fills_stack_id_zpr3 tracksRegLiveness: true registers: @@ -210,3 +249,44 @@ $z0_z1_z2_z3 = COPY %0 RET_ReallyLR ... +--- +name: spills_fills_stack_id_zpr4strided +tracksRegLiveness: true +registers: + - { id: 0, class: zpr4strided } +stack: +liveins: + - { reg: '$z0_z4_z8_z12', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $z0_z4_z8_z12 + + ; CHECK-LABEL: name: spills_fills_stack_id_zpr4strided + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16 + ; CHECK-NEXT: stack-id: scalable-vector + + ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4strided + ; EXPAND: STR_ZXI $z0, $sp, 0 + ; EXPAND: STR_ZXI $z4, $sp, 1 + ; EXPAND: STR_ZXI $z8, $sp, 2 + ; EXPAND: STR_ZXI $z12, $sp, 3 + ; EXPAND: $z0 = LDR_ZXI $sp, 0 + ; EXPAND: $z4 = LDR_ZXI $sp, 1 + ; EXPAND: $z8 = LDR_ZXI $sp, 2 + ; EXPAND: $z12 = LDR_ZXI $sp, 3 + + %0:zpr4strided = COPY $z0_z4_z8_z12 + + $z0_z1_z2_z3 = IMPLICIT_DEF + $z4_z5_z6_z7 = IMPLICIT_DEF + $z8_z9_z10_z11 = IMPLICIT_DEF + $z12_z13_z14_z15 = IMPLICIT_DEF + $z16_z17_z18_z19 = IMPLICIT_DEF + $z20_z21_z22_z23 = IMPLICIT_DEF + $z24_z25_z26_z27 = IMPLICIT_DEF + $z28_z29_z30_z31 = IMPLICIT_DEF + + $z0_z4_z8_z12 = COPY %0 + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll b/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/streaming-mode-no-reorder.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +; Test SMSTART and SMSTOP are marked as scheduling barriers and are not +; rescheduled around function calls. + +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; + +@arr = global [8 x i8] zeroinitializer, align 2 + +define void @clear_arr() { +; CHECK-LABEL: clear_arr: +; CHECK: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm + call void @streaming_callee() + call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr, i64 0, i64 0), i8 0, i64 8, i1 false) + ret void; +} + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1) diff --git a/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir b/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir --- a/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir +++ b/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir @@ -23,6 +23,29 @@ $z0_z1 = COPY $z1_z2 RET_ReallyLR +... 
+--- +name: copy_zpr2strided +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$z0_z1' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0: + liveins: $z0_z1 + ; CHECK-LABEL: name: copy_zpr2strided + ; CHECK: liveins: $z0_z1 + ; CHECK: $z8 = ORR_ZZZ $z1, $z1 + ; CHECK: $z0 = ORR_ZZZ $z0, $z0 + ; CHECK: $z1 = ORR_ZZZ $z8, $z8 + ; CHECK: $z0 = ORR_ZZZ $z0, $z0 + ; CHECK: RET_ReallyLR + $z0_z8 = COPY $z0_z1 + $z0_z1 = COPY $z0_z8 + RET_ReallyLR + ... --- name: copy_zpr3 @@ -76,3 +99,30 @@ RET_ReallyLR ... +--- +name: copy_zpr4strided +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$z0_z1_z2_z3' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0: + liveins: $z0_z1_z2_z3 + ; CHECK-LABEL: name: copy_zpr4 + ; CHECK: liveins: $z0_z1_z2_z3 + ; CHECK: $z12 = ORR_ZZZ $z3, $z3 + ; CHECK: $z8 = ORR_ZZZ $z2, $z2 + ; CHECK: $z4 = ORR_ZZZ $z1, $z1 + ; CHECK: $z0 = ORR_ZZZ $z0, $z0 + ; CHECK: $z3 = ORR_ZZZ $z12, $z12 + ; CHECK: $z2 = ORR_ZZZ $z8, $z8 + ; CHECK: $z1 = ORR_ZZZ $z4, $z4 + ; CHECK: $z0 = ORR_ZZZ $z0, $z0 + ; CHECK: RET_ReallyLR + $z0_z4_z8_z12 = COPY $z0_z1_z2_z3 + $z0_z1_z2_z3 = COPY $z0_z4_z8_z12 + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1122,3 +1122,201 @@ store <4 x double> %res, ptr %b ret void } + +define half @scvtf_i16_f16(ptr %0) { +; CHECK-LABEL: scvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to half + ret half %3 +} + +define float @scvtf_i16_f32(ptr %0) { +; CHECK-LABEL: scvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to float + ret float %3 +} + +define double @scvtf_i16_f64(ptr %0) { +; CHECK-LABEL: scvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to double + ret double %3 +} + +define half @scvtf_i32_f16(ptr %0) { +; CHECK-LABEL: scvtf_i32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to half + ret half %3 +} + +define float @scvtf_i32_f32(ptr %0) { +; CHECK-LABEL: scvtf_i32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to float + ret float %3 +} + +define double @scvtf_i32_f64(ptr %0) { +; CHECK-LABEL: scvtf_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to double + ret double %3 +} + +define half @scvtf_i64_f16(ptr %0) { +; CHECK-LABEL: scvtf_i64_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf h0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to half + ret half %3 +} + +define float @scvtf_i64_f32(ptr %0) { +; CHECK-LABEL: scvtf_i64_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to float + ret float %3 +} + +define double 
@scvtf_i64_f64(ptr %0) { +; CHECK-LABEL: scvtf_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to double + ret double %3 +} + +define half @ucvtf_i16_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to half + ret half %3 +} + +define float @ucvtf_i16_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to float + ret float %3 +} + +define double @ucvtf_i16_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to double + ret double %3 +} + +define half @ucvtf_i32_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to half + ret half %3 +} + +define float @ucvtf_i32_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to float + ret float %3 +} + +define double @ucvtf_i32_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to double + ret double %3 +} + +define half @ucvtf_i64_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf h0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = uitofp i64 %2 to half + ret half %3 +} + +define float @ucvtf_i64_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = uitofp i64 %2 to float + ret float %3 +} + +define double @ucvtf_i64_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = uitofp i64 %2 to double + ret double %3 +} diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-reduce.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sme2p1 < %s | FileCheck %s + +; +; FMAXNMQV +; + +define <8 x half> @fmaxnmqv_f16( %pg, %a) { +; CHECK-LABEL: fmaxnmqv_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnmqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.sve.fmaxnmqv.v8f16.nxv8f16( %pg, + %a) + ret <8 x half> %res +} + +define <4 x float> @fmaxnmqv_f32( %pg, %a) { +; CHECK-LABEL: fmaxnmqv_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnmqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.sve.fmaxnmqv.v4f32.nxv4f32( %pg, + %a) + ret <4 x float> %res +} + +define <2 x double> @fmaxnmqv_f64( %pg, %a) { +; CHECK-LABEL: 
fmaxnmqv_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnmqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.aarch64.sve.fmaxnmqv.v2f64.nxv2f64( %pg, + %a) + ret <2 x double> %res +} + +; +; FMINNMQV +; + +define <8 x half> @fminnmqv_f16( %pg, %a) { +; CHECK-LABEL: fminnmqv_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnmqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.sve.fminnmqv.v8f16.nxv8f16( %pg, + %a) + ret <8 x half> %res +} + +define <4 x float> @fminnmqv_f32( %pg, %a) { +; CHECK-LABEL: fminnmqv_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnmqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.sve.fminnmqv.v4f32.nxv4f32( %pg, + %a) + ret <4 x float> %res +} + +define <2 x double> @fminnmqv_f64( %pg, %a) { +; CHECK-LABEL: fminnmqv_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnmqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.aarch64.sve.fminnmqv.v2f64.nxv2f64( %pg, + %a) + ret <2 x double> %res +} + +; +; FADDQV +; + +define <8 x half> @faddqv_f16( %pg, %a) { +; CHECK-LABEL: faddqv_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: faddqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16( %pg, + %a) + ret <8 x half> %res +} + +define <4 x float> @faddqv_f32( %pg, %a) { +; CHECK-LABEL: faddqv_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: faddqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32( %pg, + %a) + ret <4 x float> %res +} + +define <2 x double> @faddqv_f64( %pg, %a) { +; CHECK-LABEL: faddqv_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: faddqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64( %pg, + %a) + ret <2 x double> %res +} + +; +; FMINQV +; + +define <8 x half> @fminqv_f16( %pg, %a) { +; CHECK-LABEL: fminqv_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fminqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.sve.fminqv.v8f16.nxv8f16( %pg, + %a) + ret <8 x half> %res +} + +define <4 x float> @fminqv_f32( %pg, %a) { +; CHECK-LABEL: fminqv_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fminqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.sve.fminqv.v4f32.nxv4f32( %pg, + %a) + ret <4 x float> %res +} + +define <2 x double> @fminqv_f64( %pg, %a) { +; CHECK-LABEL: fminqv_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fminqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.aarch64.sve.fminqv.v2f64.nxv2f64( %pg, + %a) + ret <2 x double> %res +} + +; +; FMAXQV +; + +define <8 x half> @fmaxqv_f16( %pg, %a) { +; CHECK-LABEL: fmaxqv_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.sve.fmaxqv.v8f16.nxv8f16( %pg, + %a) + ret <8 x half> %res +} + +define <4 x float> @fmaxqv_f32( %pg, %a) { +; CHECK-LABEL: fmaxqv_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.sve.fmaxqv.v4f32.nxv4f32( %pg, + %a) + ret <4 x float> %res +} + +define <2 x double> @fmaxqv_f64( %pg, %a) { +; CHECK-LABEL: fmaxqv_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x double> @llvm.aarch64.sve.fmaxqv.v2f64.nxv2f64( %pg, + %a) + ret <2 x double> %res +} + +declare <8 x half> @llvm.aarch64.sve.fmaxnmqv.v8f16.nxv8f16(, ) +declare <4 x float> @llvm.aarch64.sve.fmaxnmqv.v4f32.nxv4f32(, ) +declare <2 x double> @llvm.aarch64.sve.fmaxnmqv.v2f64.nxv2f64(, ) +declare <8 
x half> @llvm.aarch64.sve.fminnmqv.v8f16.nxv8f16(, ) +declare <4 x float> @llvm.aarch64.sve.fminnmqv.v4f32.nxv4f32(, ) +declare <2 x double> @llvm.aarch64.sve.fminnmqv.v2f64.nxv2f64(, ) +declare <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16(, ) +declare <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32(, ) +declare <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64(, ) +declare <8 x half> @llvm.aarch64.sve.fminqv.v8f16.nxv8f16(, ) +declare <4 x float> @llvm.aarch64.sve.fminqv.v4f32.nxv4f32(, ) +declare <2 x double> @llvm.aarch64.sve.fminqv.v2f64.nxv2f64(, ) +declare <8 x half> @llvm.aarch64.sve.fmaxqv.v8f16.nxv8f16(, ) +declare <4 x float> @llvm.aarch64.sve.fmaxqv.v4f32.nxv4f32(, ) +declare <2 x double> @llvm.aarch64.sve.fmaxqv.v2f64.nxv2f64(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-reduce.ll @@ -0,0 +1,356 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sme2p1 < %s | FileCheck %s + +; +; ORQV +; + +define <16 x i8> @orqv_i8( %pg, %a) { +; CHECK-LABEL: orqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: orqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @orqv_i16( %pg, %a) { +; CHECK-LABEL: orqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: orqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @orqv_i32( %pg, %a) { +; CHECK-LABEL: orqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: orqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @orqv_i64( %pg, %a) { +; CHECK-LABEL: orqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: orqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; EORQV +; + +define <16 x i8> @eorqv_i8( %pg, %a) { +; CHECK-LABEL: eorqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: eorqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @eorqv_i16( %pg, %a) { +; CHECK-LABEL: eorqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: eorqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @eorqv_i32( %pg, %a) { +; CHECK-LABEL: eorqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: eorqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @eorqv_i64( %pg, %a) { +; CHECK-LABEL: eorqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: eorqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; ANDQV +; + +define <16 x i8> @andqv_i8( %pg, %a) { +; CHECK-LABEL: andqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: andqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @andqv_i16( %pg, %a) { +; CHECK-LABEL: andqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: 
andqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @andqv_i32( %pg, %a) { +; CHECK-LABEL: andqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: andqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @andqv_i64( %pg, %a) { +; CHECK-LABEL: andqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: andqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; ADDQV +; + +define <16 x i8> @addqv_i8( %pg, %a) { +; CHECK-LABEL: addqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: addqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @addqv_i16( %pg, %a) { +; CHECK-LABEL: addqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: addqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @addqv_i32( %pg, %a) { +; CHECK-LABEL: addqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: addqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @addqv_i64( %pg, %a) { +; CHECK-LABEL: addqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: addqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.addqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; SMAXQV +; + +define <16 x i8> @smaxqv_i8( %pg, %a) { +; CHECK-LABEL: smaxqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: smaxqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.smaxqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @smaxqv_i16( %pg, %a) { +; CHECK-LABEL: smaxqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smaxqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.smaxqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @smaxqv_i32( %pg, %a) { +; CHECK-LABEL: smaxqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: smaxqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.smaxqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @smaxqv_i64( %pg, %a) { +; CHECK-LABEL: smaxqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: smaxqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.smaxqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; UMAXQV +; + +define <16 x i8> @umaxqv_i8( %pg, %a) { +; CHECK-LABEL: umaxqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: umaxqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.umaxqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @umaxqv_i16( %pg, %a) { +; CHECK-LABEL: umaxqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: umaxqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.umaxqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @umaxqv_i32( %pg, %a) { +; CHECK-LABEL: umaxqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umaxqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.umaxqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @umaxqv_i64( %pg, %a) { +; CHECK-LABEL: umaxqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umaxqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> 
@llvm.aarch64.sve.umaxqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; SMINQV +; + +define <16 x i8> @sminqv_i8( %pg, %a) { +; CHECK-LABEL: sminqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sminqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.sminqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @sminqv_i16( %pg, %a) { +; CHECK-LABEL: sminqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sminqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.sminqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @sminqv_i32( %pg, %a) { +; CHECK-LABEL: sminqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sminqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.sminqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @sminqv_i64( %pg, %a) { +; CHECK-LABEL: sminqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sminqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.sminqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +; +; UMINQV +; + +define <16 x i8> @uminqv_i8( %pg, %a) { +; CHECK-LABEL: uminqv_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uminqv v0.16b, p0, z0.b +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.sve.uminqv.v16i8.nxv16i8( %pg, %a); + ret <16 x i8> %res +} + +define <8 x i16> @uminqv_i16( %pg, %a) { +; CHECK-LABEL: uminqv_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uminqv v0.8h, p0, z0.h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.aarch64.sve.uminqv.v8i16.nxv8i16( %pg, %a); + ret <8 x i16> %res +} + +define <4 x i32> @uminqv_i32( %pg, %a) { +; CHECK-LABEL: uminqv_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uminqv v0.4s, p0, z0.s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.aarch64.sve.uminqv.v4i32.nxv4i32( %pg, %a); + ret <4 x i32> %res +} + +define <2 x i64> @uminqv_i64( %pg, %a) { +; CHECK-LABEL: uminqv_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uminqv v0.2d, p0, z0.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64( %pg, %a); + ret <2 x i64> %res +} + +declare <16 x i8> @llvm.aarch64.sve.orqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.orqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.orqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.eorqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.eorqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.eorqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.andqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.andqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.andqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.andqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.addqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.addqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.addqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.smaxqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.smaxqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.smaxqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.smaxqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.umaxqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.umaxqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.umaxqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.umaxqv.v2i64.nxv2i64(, ) 
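; Illustrative usage sketch, not part of the patch: one of the new quadword
; reductions written out with explicit scalable types. The intrinsic name and
; operand order follow the uminqv_i64 test above; the <vscale x 2 x i1> predicate
; and <vscale x 2 x i64> data types are the conventional SVE types implied by the
; .v2i64.nxv2i64 suffix and are an assumption of this sketch rather than text
; taken verbatim from the diff.
define <2 x i64> @uminqv_i64_usage_example(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
  ; Call the quadword unsigned-minimum reduction; the result is a fixed
  ; 128-bit <2 x i64> vector, as in the uminqv_i64 test above.
  %res = call <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
  ret <2 x i64> %res
}
; Same declaration as elsewhere in this file, repeated here with the assumed
; explicit types so the sketch is self-contained.
declare <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)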
+declare <16 x i8> @llvm.aarch64.sve.sminqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.sminqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.sminqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.sminqv.v2i64.nxv2i64(, ) +declare <16 x i8> @llvm.aarch64.sve.uminqv.v16i8.nxv16i8(, ) +declare <8 x i16> @llvm.aarch64.sve.uminqv.v8i16.nxv8i16(, ) +declare <4 x i32> @llvm.aarch64.sve.uminqv.v4i32.nxv4i32(, ) +declare <2 x i64> @llvm.aarch64.sve.uminqv.v2i64.nxv2i64(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfadd_pred( %pg, %a, %b){ +; CHECK-LABEL: bfadd_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fadd.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfadd_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfadd_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fadd.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfadd_u( %pg, %a, %b){ +; CHECK-LABEL: bfadd_u: +; CHECK: // %bb.0: +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fadd.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfadd_u_ptrue( %a, %b){ +; CHECK-LABEL: bfadd_u_ptrue: +; CHECK: // %bb.0: +; CHECK-NEXT: bfadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fadd.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfadd_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfadd_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fadd.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fadd.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fadd.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfclamp( %a, %b, %c){ +; CHECK-LABEL: bfclamp: +; CHECK: // %bb.0: +; CHECK-NEXT: bfclamp z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fclamp.nxv8bf16( %a, %b, %c) + ret %res +} + +declare @llvm.aarch64.sve.fclamp.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu 
-mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfmax_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmax_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmax.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmax( %a, %b){ +; CHECK-LABEL: bfmax: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmax.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmax_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmax_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmax.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfmax_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmax_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmax.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmax_u( %a, %b){ +; CHECK-LABEL: bfmax_u: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmax.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmax_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmax_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmax.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fmax.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fmax.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfmaxnm_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmaxnm_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmaxnm.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmaxnm( %a, %b){ +; CHECK-LABEL: bfmaxnm: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmaxnm.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmaxnm_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmaxnm_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmaxnm.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfmaxnm_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmaxnm_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define 
@bfmaxnm_u( %a, %b){ +; CHECK-LABEL: bfmaxnm_u: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmaxnm_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmaxnm_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmaxnm.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fmaxnm.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfmin_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmin_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmin.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmin( %a, %b){ +; CHECK-LABEL: bfmin: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmin.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmin_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmin_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmin.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfmin_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmin_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmin.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmin_u( %a, %b){ +; CHECK-LABEL: bfmin_u: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmin.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmin_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmin_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmin.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fmin.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fmin.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 
-mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfminnm_pred( %pg, %a, %b){ +; CHECK-LABEL: bfminnm_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fminnm.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfminnm( %a, %b){ +; CHECK-LABEL: bfminnm: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fminnm.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfminnm_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfminnm_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fminnm.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfminnm_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfminnm_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fminnm.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfminnm_u( %a, %b){ +; CHECK-LABEL: bfminnm_u: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fminnm.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfminnm_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfminnm_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fminnm.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fminnm.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fminnm.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmla_m( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmla_m: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmla.nxv8bf16( %pg, %a, %b, %c) + ret %res +} + +define @bfmla_x( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmla_x: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmla.u.nxv8bf16( %pg, %a, %b, %c) + ret %res +} + +define @bfmla_z( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmla_z: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z3.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %res = call @llvm.aarch64.sve.fmla.nxv8bf16( %pg, %a_z, %b, %c) + ret %res +} + +declare @llvm.aarch64.sve.fmla.nxv8bf16(, , , ) +declare @llvm.aarch64.sve.fmla.u.nxv8bf16(, , , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll @@ -0,0 
+1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmla_lane_idx1( %a, %b, %c){ +; CHECK-LABEL: bfmla_lane_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[1] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmla.lane.nxv8bf16( %a, %b, %c, i32 1) + ret %res +} + +define @bfmla_lane_idx3( %a, %b, %c){ +; CHECK-LABEL: bfmla_lane_idx3: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[3] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmla.lane.nxv8bf16( %a, %b, %c, i32 3) + ret %res +} + +define @bfmla_lane_idx7( %a, %b, %c){ +; CHECK-LABEL: bfmla_lane_idx7: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmla.lane.nxv8bf16( %a, %b, %c, i32 7) + ret %res +} + +declare @llvm.aarch64.sve.fmla.lane.nxv8bf16(, , , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmls_m( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmls_m: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmls.nxv8bf16( %pg, %a, %b, %c) + ret %res +} + +define @bfmls_x( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmls_x: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmls.u.nxv8bf16( %pg, %a, %b, %c) + ret %res +} + + +define @bfmls_z( %pg, %a, %b, %c){ +; CHECK-LABEL: bfmls_z: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z3.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %res = call @llvm.aarch64.sve.fmls.nxv8bf16( %pg, %a_z, %b, %c) + ret %res +} + +declare @llvm.aarch64.sve.fmls.nxv8bf16(, , , ) +declare @llvm.aarch64.sve.fmls.u.nxv8bf16(, , , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmls_lane_idx1( %a, %b, %c){ +; CHECK-LABEL: bfmls_lane_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[1] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmls.lane.nxv8bf16( %a, %b, %c, i32 1) + ret %res +} + +define @bfmls_lane_idx3( %a, %b, %c){ +; CHECK-LABEL: bfmls_lane_idx3: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[3] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmls.lane.nxv8bf16( %a, %b, %c, i32 3) + ret %res +} + +define @bfmls_lane_idx7( %a, %b, %c){ +; CHECK-LABEL: bfmls_lane_idx7: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[7] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmls.lane.nxv8bf16( %a, %b, %c, i32 7) + ret %res +} + +declare @llvm.aarch64.sve.fmls.lane.nxv8bf16(, , , i32) diff --git 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmlslb_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslb_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslb( %zda, %zn, %zm) + ret %out +} + +define @bfmlslt_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslt_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslt( %zda, %zn, %zm) + ret %out +} + +define @bfmlslb_lane_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslb_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslb.lane( %zda, %zn, %zm, i32 7) + ret %out +} + +define @bfmlslt_lane_f32( %zda, %zn, %zm) { +; CHECK-LABEL: bfmlslt_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bfmlslt.lane( %zda, %zn, %zm, i32 7) + ret %out +} + +declare @llvm.aarch64.sve.bfmlslb(, , ) +declare @llvm.aarch64.sve.bfmlslt(, , ) +declare @llvm.aarch64.sve.bfmlslb.lane(, , , i32) +declare @llvm.aarch64.sve.bfmlslt.lane(, , , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfmul_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmul_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmul.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmul_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmul_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmul.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfmul_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfmul_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fmul.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfmul_u( %a, %b){ +; CHECK-LABEL: bfmul_u: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fmul.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfmul_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfmul_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fmul.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fmul.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fmul.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s + +define @bfmul_lane_idx1( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[1] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 1) + ret %out +} + +define @bfmul_lane_idx3( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx3: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[3] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 3) + ret %out +} + +define @bfmul_lane_idx7( %a, %b) { +; CHECK-LABEL: bfmul_lane_idx7: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[7] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmul.lane.nxv8bf16( %a, + %b, + i32 7) + ret %out +} + +declare @llvm.aarch64.sve.fmul.lane.nxv8bf16(, , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define @bfsub_pred( %pg, %a, %b){ +; CHECK-LABEL: bfsub_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fsub.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfsub_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfsub_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsub.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +define @bfsub_u_pred( %pg, %a, %b){ +; CHECK-LABEL: bfsub_u_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.fsub.u.nxv8bf16( %pg, %a, %b) + ret %res +} + +define @bfsub_u( %a, %b){ +; CHECK-LABEL: bfsub_u: +; CHECK: // %bb.0: +; CHECK-NEXT: bfsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %elt = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %res = call @llvm.aarch64.sve.fsub.u.nxv8bf16( %elt, %a, %b) + ret %res +} + +define @bfsub_u_zeroing( %pg, %a, %b) { +; CHECK-LABEL: bfsub_u_zeroing: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.fsub.u.nxv8bf16( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fsub.nxv8bf16(, , ) +declare @llvm.aarch64.sve.fsub.u.nxv8bf16(, , ) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc 
-mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_extq_i8 ( %zn, %zm) { +; CHECK-LABEL: test_extq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #0 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv16i8( %zn, %zm, i32 0) + ret %res +} + +define @test_extq_i16 ( %zn, %zm) { +; CHECK-LABEL: test_extq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #1 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv8i16( %zn, %zm, i32 1) + ret %res +} + +define @test_extq_i32 ( %zn, %zm) { +; CHECK-LABEL: test_extq_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #2 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv4i32( %zn, %zm, i32 2) + ret %res +} + +define @test_extq_i64 ( %zn, %zm) { +; CHECK-LABEL: test_extq_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #3 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv2i64( %zn, %zm, i32 3) + ret %res +} + +define @test_extq_f16( %zn, %zm) { +; CHECK-LABEL: test_extq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #4 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv8f16( %zn, %zm, i32 4) + ret %res +} + +define @test_extq_f32( %zn, %zm) { +; CHECK-LABEL: test_extq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #5 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv4f32( %zn, %zm, i32 5) + ret %res +} + +define @test_extq_f64( %zn, %zm) { +; CHECK-LABEL: test_extq_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #6 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv2f64( %zn, %zm, i32 6) + ret %res +} + +define @test_extq_bf16( %zn, %zm) { +; CHECK-LABEL: test_extq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: extq z0.b, z0.b, z1.b, #15 +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.extq.lane.nxv8bf16( %zn, %zm, i32 15) + ret %res +} + +declare @llvm.aarch64.sve.extq.lane.nxv16i8(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv8i16(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv4i32(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv2i64(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv8f16(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv4f32(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv2f64(, , i32) +declare @llvm.aarch64.sve.extq.lane.nxv8bf16(, , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-fclamp.ll @@ -3,8 +3,8 @@ target triple = "aarch64-linux-gnu" -define @test_fclamp_f16( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f16: +define @test_fclamp_i16( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret @@ -12,8 +12,8 @@ ret %res } -define @test_fclamp_f32( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f32: +define @test_fclamp_i32( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret @@ -21,8 +21,8 @@ ret %res } -define @test_fclamp_f64( %a, %b, %c) #0 { -; CHECK-LABEL: test_fclamp_f64: +define @test_fclamp_i64( %a, %b, %c) #0 { +; CHECK-LABEL: test_fclamp_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: fclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret @@ -30,8 +30,8 @@ ret %res } -define { , } @test_fclamp_single_x2_f16( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f16: +define { , } 
@test_fclamp_single_x2_i16( %a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -41,8 +41,8 @@ ret { , } %res } -define { , } @test_fclamp_single_x2_f32( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f32: +define { , } @test_fclamp_single_x2_i32( %a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -52,8 +52,8 @@ ret { , } %res } -define { , } @test_fclamp_single_x2_f64( %a, %b, %c, %d) #1 { -; CHECK-LABEL: test_fclamp_single_x2_f64: +define { , } @test_fclamp_single_x2_i64( %a, %b, %c, %d) #1 { +; CHECK-LABEL: test_fclamp_single_x2_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 @@ -64,8 +64,8 @@ } -define { , , , } @test_fclamp_single_x4_f16( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f16: +define { , , , } @test_fclamp_single_x4_i16( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 @@ -77,8 +77,8 @@ ret { , , , } %res } -define { , , , } @test_fclamp_single_x4_f32( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f32: +define { , , , } @test_fclamp_single_x4_i32( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 @@ -90,8 +90,8 @@ ret { , , , } %res } -define { , , , } @test_fclamp_single_x4_f64( %a, %b, %c, %d, %e, %f) #1 { -; CHECK-LABEL: test_fclamp_single_x4_f64: +define { , , , } @test_fclamp_single_x4_i64( %a, %b, %c, %d, %e, %f) #1 { +; CHECK-LABEL: test_fclamp_single_x4_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-gather-loads-128bit-unscaled-offset.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +; +; LD1Q: vector base + unscaled offset +; e.g. 
ld1q { z0.q }, p0/z, [z0.d, x0] +; +define @ld1q_gather_u64base_i8( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_i16( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_i32( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_i64( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_f16( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_f32( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + + +define @ld1q_gather_u64base_f64( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @ld1q_gather_u64base_bf16( %pg, %base, i64 %offset) { +; CHECK-LABEL: ld1q_gather_u64base_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1q { z0.q }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv16i8.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4i32.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2i64.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8f16.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv4f32.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv2f64.nxv2i64(, , i64) +declare @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8bf16.nxv2i64(, , i64) + diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s 
| FileCheck %s + +; LD1W + +define @test_svld1uwq_i32_ss( %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svld1uwq_i32_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %gep = getelementptr i32, ptr %base, i64 %offset + %res = call @llvm.aarch64.sve.ld1uwq.nxv4i32( %pred, ptr %gep) + ret %res +} + +define @test_svld1uwq_i32_si( %pred, * %base) { +; CHECK-LABEL: test_svld1uwq_i32_si: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: ld1w { z1.q }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + %res1 = call @llvm.aarch64.sve.ld1uwq.nxv4i32( %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + %res2 = call @llvm.aarch64.sve.ld1uwq.nxv4i32( %pred, ptr %gep2) + + %res = add %res1, %res2 + ret %res +} + +define @test_svld1uwq_i32_out_of_bound( %pred, * %base) { +; CHECK-LABEL: test_svld1uwq_i32_out_of_bound: +; CHECK: // %bb.0: +; CHECK-NEXT: addvl x8, x0, #2 +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr inbounds , * %base, i64 8 + %res = call @llvm.aarch64.sve.ld1uwq.nxv4i32( %pred, ptr %gep) + + ret %res +} + +define @test_svld1uwq_f32_ss( %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svld1uwq_f32_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %gep = getelementptr float, ptr %base, i64 %offset + %res = call @llvm.aarch64.sve.ld1uwq.nxv4f32( %pred, ptr %gep) + ret %res +} + +define @test_svld1uwq_f32_si( %pred, * %base) { +; CHECK-LABEL: test_svld1uwq_f32_si: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: ld1w { z1.q }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + %res1 = call @llvm.aarch64.sve.ld1uwq.nxv4f32( %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + %res2 = call @llvm.aarch64.sve.ld1uwq.nxv4f32( %pred, ptr %gep2) + + %res = fadd %res1, %res2 + ret %res +} + +; LD1D + +define @test_svld1udq_i64_ss( %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svld1udq_i64_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %gep = getelementptr i64, ptr %base, i64 %offset + %res = call @llvm.aarch64.sve.ld1udq.nxv2i64( %pred, ptr %gep) + ret %res +} + +define @test_svld1udq_i64_si( %pred, * %base) { +; CHECK-LABEL: test_svld1udq_i64_si: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: ld1d { z1.q }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + %res1 = call @llvm.aarch64.sve.ld1udq.nxv2i64( %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + %res2 = call @llvm.aarch64.sve.ld1udq.nxv2i64( %pred, ptr %gep2) + + %res = add %res1, %res2 + ret %res +} + +define @test_svld1udq_i64_out_of_bound( %pred, * %base) { +; CHECK-LABEL: test_svld1udq_i64_out_of_bound: +; CHECK: // %bb.0: +; CHECK-NEXT: addvl x8, x0, #-5 +; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8] +; CHECK-NEXT: ret + %gep = getelementptr inbounds , * %base, i64 -10 + %res = call @llvm.aarch64.sve.ld1udq.nxv2i64( %pred, ptr %gep) + + ret %res +} + +define @test_svld1udq_f64_ss( %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svld1udq_f64_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, x1, lsl #3] +; 
CHECK-NEXT: ret + %gep = getelementptr double, ptr %base, i64 %offset + %res = call @llvm.aarch64.sve.ld1udq.nxv2f64( %pred, ptr %gep) + ret %res +} + +define @test_svld1udq_f64_si( %pred, * %base) { +; CHECK-LABEL: test_svld1udq_f64_si: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: ld1d { z1.q }, p0/z, [x0, #7, mul vl] +; CHECK-NEXT: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + %res1 = call @llvm.aarch64.sve.ld1udq.nxv2f64( %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + %res2 = call @llvm.aarch64.sve.ld1udq.nxv2f64( %pred, ptr %gep2) + + %res = fadd %res1, %res2 + ret %res +} + +declare @llvm.aarch64.sve.ld1uwq.nxv4i32(, ptr) +declare @llvm.aarch64.sve.ld1uwq.nxv4f32(, ptr) + +declare @llvm.aarch64.sve.ld1udq.nxv2i64(, ptr) +declare @llvm.aarch64.sve.ld1udq.nxv2f64(, ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-loads.ll @@ -0,0 +1,797 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +;;LD2Q + +define { , } @ld2q_si_i8_off16( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_i8_off16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to i8 * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_si_i8_off14( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_i8_off14: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 14 + %base_ptr = bitcast * %base to i8 * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_i8( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_i8( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_i16( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to i16 * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_i16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_i16( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( %pg, ptr %addr); + ret { , } %res 
+} + +define { , } @ld2q_si_i32( %pg, ptr %addr ) { +; CHECK-LABEL: ld2q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , ptr %addr, i64 -16 + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( %pg, ptr %base); + ret { , } %res +} + +define { , } @ld2q_ss_i32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_i32( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_i64( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to i64 * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_i64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_i64( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_f16( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to half * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_f16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_f16( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_f32( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to float * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_f32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_f32( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: 
ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_f64( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to double * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_f64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_f64( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( %pg, ptr %addr); + ret { , } %res +} + +define { , } @ld2q_si_bf16( %pg, *%addr ) { +; CHECK-LABEL: ld2q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + %base_ptr = bitcast * %base to bfloat * + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( %pg, ptr %base_ptr); + ret { , } %res +} + +define { , } @ld2q_ss_bf16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld2q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( %pg, ptr %addr2); + ret { , } %res +} + +define { , } @ld2q_bf16( %pg, ptr %addr) { +; CHECK-LABEL: ld2q_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2q { z0.q, z1.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( %pg, ptr %addr); + ret { , } %res +} + +;; LD3Q +define { , , } @ld3q_si_i8_off24( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_i8_off24: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to i8 * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_si_i8_off21( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_i8_off21: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + %base_ptr = bitcast * %base to i8 * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_i8( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_i8( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_i16( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = 
getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to i16 * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_i16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_i16( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_i32( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to i32 * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_i32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_i32( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_i64( %pg, ptr %addr ) { +; CHECK-LABEL: ld3q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %addr2 = getelementptr , ptr %addr, i64 -24 + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( %pg, ptr %addr2); + ret {, , } %res +} + +define { , , } @ld3q_ss_i64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_i64( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_f16( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to half * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_f16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_f16( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( 
%pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_f32( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to float * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_f32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_f32( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_f64( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to double * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_f64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_f64( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( %pg, ptr %addr); + ret { , , } %res +} + +define { , , } @ld3q_si_bf16( %pg, *%addr ) { +; CHECK-LABEL: ld3q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + %base_ptr = bitcast * %base to bfloat * + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( %pg, ptr %base_ptr); + ret { , , } %res +} + +define { , , } @ld3q_ss_bf16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld3q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( %pg, ptr %addr2); + ret { , , } %res +} + +define { , , } @ld3q_bf16( %pg, ptr %addr) { +; CHECK-LABEL: ld3q_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld3q { z0.q - z2.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( %pg, ptr %addr); + ret { , , } %res +} + +;; LD4Q +define { , , , } @ld4q_si_i8_off32( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_i8_off32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to i8 * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_i8_off28( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_i8_off28: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + %base_ptr = 
bitcast * %base to i8 * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_i8( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_i8( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_i16( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to i16 * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_i16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_i16( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_i32( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to i32 * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_i32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_i32( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_i64( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to i64 * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_i64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_i64( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = 
call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_f16( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to half * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_f16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_f16( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_f32( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to float * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_f32( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_f32( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_f64( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to double * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_f64( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_f64( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( %pg, ptr %addr); + ret { , , , } %res +} + +define { , , , } @ld4q_si_bf16( %pg, *%addr ) { +; CHECK-LABEL: ld4q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + %base_ptr = bitcast * %base to bfloat * + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( %pg, ptr %base_ptr); + ret { , , , } %res +} + +define { , , , } @ld4q_ss_bf16( %pg, ptr %addr, i64 %a) { +; CHECK-LABEL: ld4q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, 
lsl #4] +; CHECK-NEXT: ret + %addr2 = getelementptr i128, ptr %addr, i64 %a + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( %pg, ptr %addr2); + ret { , , , } %res +} + +define { , , , } @ld4q_bf16( %pg, ptr %addr) { +; CHECK-LABEL: ld4q_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( %pg, ptr %addr); + ret { , , , } %res +} + + +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64(, ptr) + +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64(, ptr) +declare { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(, ptr) + +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64(, ptr) + +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64(, ptr) +declare { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(, ptr) + +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64(, ptr) + +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64(, ptr) +declare { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(, ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll @@ -0,0 +1,910 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +; +; ST2Q +; +define void @st2q_ss_i8( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv16i8(%v0, %v1 , + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_i16( %v0, 
%v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv8i16( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_i32( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv4i32( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_i64( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv2i64( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_f16( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv8f16( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_f32( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv4f32( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_f64( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv2f64( %v0, + %v1, + %pred, + ptr %1) + ret void +} + +define void @st2q_ss_bf16( %v0, %v1, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st2q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st2q.nxv8bf16( %v0, + %v1, + %pred, + ptr %1) + ret void +} + + +define void @st2q_si_i8_off16( %v0, %v1, %pred, * %addr) { +; CHECK-LABEL: st2q_si_i8_off16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, 
z1.q }, p0, [x0, #-16, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -16 + call void @llvm.aarch64.sve.st2q.nxv16i8( %v0, + %v1, + %pred, + i8* %base) + ret void +} + +define void @st2q_si_i8_off14( %v0, %v1, %pred, * %addr) { +; CHECK-LABEL: st2q_si_i8_off14: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 14 + call void @llvm.aarch64.sve.st2q.nxv16i8( %v0, + %v1, + %pred, + i8* %base) + ret void +} + +define void @st2q_si_i16( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv8i16( %v0, + %v1, + %pred, + i8* %gep) + ret void +} + +define void @st2q_si_i32( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv4i32( %v0, + %v1, + %pred, + i32* %gep) + ret void +} + +define void @st2q_si_i64( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv2i64( %v0, + %v1, + %pred, + i64* %gep) + ret void +} + +define void @st2q_si_f16( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv8f16( %v0, + %v1, + %pred, + half* %gep) + ret void +} + +define void @st2q_si_f32( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv4f32( %v0, + %v1, + %pred, + float* %gep) + ret void +} + +define void @st2q_si_f64( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep= getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv2f64( %v0, + %v1, + %pred, + double* %gep) + ret void +} + +define void @st2q_si_bf16( %v0, %v1, %pred, ptr %base) { +; CHECK-LABEL: st2q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 
def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2q { z0.q, z1.q }, p0, [x0, #14, mul vl] +; CHECK-NEXT: ret + %gep = getelementptr , ptr %base, i64 14 + call void @llvm.aarch64.sve.st2q.nxv8bf16( %v0, + %v1, + %pred, + bfloat* %gep) + ret void +} + + +; +; ST3Q +; +define void @st3q_ss_i8( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv16i8(%v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_i16( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv8i16( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_i32( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv4i32( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_i64( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv2i64( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_f16( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv8f16( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_f32( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, 
[x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv4f32( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_f64( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv2f64( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_ss_bf16( %v0, %v1, %v2, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st3q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st3q.nxv8bf16( %v0, + %v1, + %v2, + %pred, + ptr %1) + ret void +} + +define void @st3q_si_i8_off24( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_i8_off24: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #-24, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -24 + call void @llvm.aarch64.sve.st3q.nxv16i8( %v0, + %v1, + %v2, + %pred, + i8* %base) + ret void +} + +define void @st3q_si_i8_off21( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_i8_off21: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv16i8( %v0, + %v1, + %v2, + %pred, + i8* %base) + ret void +} + +define void @st3q_si_i16( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv8i16( %v0, + %v1, + %v2, + %pred, + i8* %base) + ret void +} + +define void @st3q_si_i32( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv4i32( %v0, + %v1, + %v2, + %pred, + i32* %base) + ret void 
+} + +define void @st3q_si_i64( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv2i64( %v0, + %v1, + %v2, + %pred, + i64* %base) + ret void +} + +define void @st3q_si_f16( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv8f16( %v0, + %v1, + %v2, + %pred, + half* %base) + ret void +} + +define void @st3q_si_f32( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv4f32( %v0, + %v1, + %v2, + %pred, + float* %base) + ret void +} + +define void @st3q_si_f64( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv2f64( %v0, + %v1, + %v2, + %pred, + double* %base) + ret void +} + +define void @st3q_si_bf16( %v0, %v1, %v2, %pred, * %addr) { +; CHECK-LABEL: st3q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3q { z0.q - z2.q }, p0, [x0, #21, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 21 + call void @llvm.aarch64.sve.st3q.nxv8bf16( %v0, + %v1, + %v2, + %pred, + bfloat* %base) + ret void +} + +; +; ST4Q +; +define void @st4q_ss_i8( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv16i8(%v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_i16( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { 
+; CHECK-LABEL: st4q_ss_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv8i16( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_i32( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv4i32( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_i64( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv2i64( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_f16( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv8f16( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_f32( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv4f32( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_f64( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // 
kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv2f64( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_ss_bf16( %v0, %v1, %v2, %v3, %pred, ptr %addr, i64 %offset) { +; CHECK-LABEL: st4q_ss_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4] +; CHECK-NEXT: ret + %1 = getelementptr i128, ptr %addr, i64 %offset + call void @llvm.aarch64.sve.st4q.nxv8bf16( %v0, + %v1, + %v2, + %v3, + %pred, + ptr %1) + ret void +} + +define void @st4q_si_i8_off32( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_i8_off32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #-32, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 -32 + call void @llvm.aarch64.sve.st4q.nxv16i8( %v0, + %v1, + %v2, + %v3, + %pred, + i8* %base) + ret void +} + +define void @st4q_si_i8_off28( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_i8_off28: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv16i8( %v0, + %v1, + %v2, + %v3, + %pred, + i8* %base) + ret void +} + +define void @st4q_si_i16( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv8i16( %v0, + %v1, + %v2, + %v3, + %pred, + i8* %base) + ret void +} + +define void @st4q_si_i32( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed 
$z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base1 = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv4i32( %v0, + %v1, + %v2, + %v3, + %pred, + i32* %base1) + ret void +} + +define void @st4q_si_i64( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv2i64( %v0, + %v1, + %v2, + %v3, + %pred, + i64* %base) + ret void +} + +define void @st4q_si_f16( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv8f16( %v0, + %v1, + %v2, + %v3, + %pred, + half* %base) + ret void +} + +define void @st4q_si_f32( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv4f32( %v0, + %v1, + %v2, + %v3, + %pred, + float* %base) + ret void +} + +define void @st4q_si_f64( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv2f64( %v0, + %v1, + %v2, + %v3, + %pred, + double* %base) + ret void +} + +define void @st4q_si_bf16( %v0, %v1, %v2, %v3, %pred, * %addr) { +; CHECK-LABEL: st4q_si_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4q { z0.q - z3.q }, p0, [x0, #28, mul vl] +; CHECK-NEXT: ret + %base = getelementptr , * %addr, i64 28 + call void @llvm.aarch64.sve.st4q.nxv8bf16( %v0, + %v1, + %v2, + %v3, + %pred, 
+ bfloat* %base) + ret void +} + + +declare void @llvm.aarch64.sve.st2q.nxv16i8(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv8i16(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv4i32(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv2i64(, , , ptr) + +declare void @llvm.aarch64.sve.st2q.nxv8f16(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv4f32(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv2f64(, , , ptr) +declare void @llvm.aarch64.sve.st2q.nxv8bf16(, , , ptr) + +declare void @llvm.aarch64.sve.st3q.nxv16i8(, ,, , ptr) +declare void @llvm.aarch64.sve.st3q.nxv8i16(, , , , ptr) +declare void @llvm.aarch64.sve.st3q.nxv4i32(, , , , ptr) +declare void @llvm.aarch64.sve.st3q.nxv2i64(, , , , ptr) + +declare void @llvm.aarch64.sve.st3q.nxv8f16(, , , , ptr) +declare void @llvm.aarch64.sve.st3q.nxv4f32(, , , , ptr) +declare void @llvm.aarch64.sve.st3q.nxv2f64(, , , , ptr) +declare void @llvm.aarch64.sve.st3q.nxv8bf16(, , , , ptr) + +declare void @llvm.aarch64.sve.st4q.nxv16i8(, , , ,, ptr) +declare void @llvm.aarch64.sve.st4q.nxv8i16(, , , , , ptr) +declare void @llvm.aarch64.sve.st4q.nxv4i32(, , , ,, ptr) +declare void @llvm.aarch64.sve.st4q.nxv2i64(, , , , , ptr) + +declare void @llvm.aarch64.sve.st4q.nxv8f16(, , , , , ptr) +declare void @llvm.aarch64.sve.st4q.nxv4f32(, , , , , ptr) +declare void @llvm.aarch64.sve.st4q.nxv2f64(, , , , , ptr) +declare void @llvm.aarch64.sve.st4q.nxv8bf16(, , , , , ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_pmov_to_pred_i8( %zn) { +; CHECK-LABEL: test_pmov_to_pred_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov p0.b, z0 +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( %zn, i32 0) + ret %res +} + +define @test_pmov_to_pred_i16( %zn) { +; CHECK-LABEL: test_pmov_to_pred_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: pmov p1.h, z0[0] +; CHECK-NEXT: pmov p2.h, z0[1] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + entry: + %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( %zn, i32 0) + %res2 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( %zn, i32 1) + + %res = add %res1, %res2 + ret %res +} + +define @test_pmov_to_pred_i32( %zn) { +; CHECK-LABEL: test_pmov_to_pred_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: pmov p1.s, z0[0] +; CHECK-NEXT: pmov p2.s, z0[3] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + entry: + %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( %zn, i32 0) + %res2 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( %zn, i32 3) + + %res = add %res1, %res2 + ret %res +} + +define @test_pmov_to_pred_i64( %zn) { +; CHECK-LABEL: test_pmov_to_pred_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: pmov p1.d, z0[0] +; CHECK-NEXT: pmov p2.d, z0[7] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + entry: + %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( %zn, i32 0) + %res2 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( %zn, i32 7) + + %res = add %res1, %res2 + ret %res +} + +declare @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(, i32) +declare 
@llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(, i32) +declare @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(, i32) +declare @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(, i32) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +; Merge + +define @test_pmov_to_vector_i16( %zn, %pn) { +; CHECK-LABEL: test_pmov_to_vector_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[1], p0.h +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( %zn, %pn, i32 1) + ret %res +} + +define @test_pmov_to_vector_i32( %zn, %pn) { +; CHECK-LABEL: test_pmov_to_vector_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[3], p0.s +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( %zn, %pn, i32 3) + ret %res +} + +define @test_pmov_to_vector_i64( %zn, %pn) { +; CHECK-LABEL: test_pmov_to_vector_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[7], p0.d +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( %zn, %pn, i32 7) + ret %res +} + + +; Zero + +define @test_pmov_to_vector_zero_i8( %pn) { +; CHECK-LABEL: test_pmov_to_vector_zero_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0, p0.b +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( %pn) + ret %res +} + +define @test_pmov_to_vector_zero_i16( %pn) { +; CHECK-LABEL: test_pmov_to_vector_zero_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[0], p0.h +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( %pn) + ret %res +} + +define @test_pmov_to_vector_zero_i32( %pn) { +; CHECK-LABEL: test_pmov_to_vector_zero_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[0], p0.s +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( %pn) + ret %res +} + +define @test_pmov_to_vector_zero_i64( %pn) { +; CHECK-LABEL: test_pmov_to_vector_zero_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: pmov z0[0], p0.d +; CHECK-NEXT: ret + entry: + %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( %pn) + ret %res +} + +declare @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(, , i32) +declare @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(, , i32) +declare @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(, , i32) + +declare @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8() +declare @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16() +declare @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32() +declare @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64() diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 -mattr=+sve2p1 < %s | FileCheck %s -; RUN: llc -mtriple=aarch64 -mattr=+sme2 < %s | FileCheck %s define 
@pext_b(target("aarch64.svcount") %x) nounwind { ; CHECK-LABEL: pext_b: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-qrshr.ll @@ -1,27 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+bf16 -verify-machineinstrs < %s | FileCheck %s ; ; S/UQRSHRN x2 ; -define @multi_vector_sat_shift_narrow_interleave_x2_s16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_narrow_interleave_x2_s16( %zn1, %zn2) { ; CHECK-LABEL: multi_vector_sat_shift_narrow_interleave_x2_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: sqrshrn z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sqrshrn z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sqrshrn.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res } -define @multi_vector_sat_shift_narrow_interleave_x2_u16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_narrow_interleave_x2_u16( %zn1, %zn2) { ; CHECK-LABEL: multi_vector_sat_shift_narrow_interleave_x2_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: uqrshrn z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: uqrshrn z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uqrshrn.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res @@ -31,12 +31,12 @@ ; SQRSHRUN x2 ; -define @multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16( %unused, %zn1, %zn2) { +define @multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16( %zn1, %zn2) { ; CHECK-LABEL: multi_vector_sat_shift_unsigned_narrow_interleave_x2_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: sqrshrun z0.h, { z2.s, z3.s }, #16 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sqrshrun z0.h, { z0.s, z1.s }, #16 ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sqrshrun.x2.nxv8i16( %zn1, %zn2, i32 16) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-scatter-stores-128bit-unscaled-offset.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +; +; ST1Q: vector base + unscaled offset +; e.g. 
st1q { z0.q }, p0, [z1.d, x0] +; + +define void @sst1_scatter_u64base_offset_i8( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_i16( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_i32( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_i64( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_f16( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_f32( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_f64( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +define void @sst1_scatter_u64base_offset_bf16( %data, %pg, %b, i64 %offset) { +; CHECK-LABEL: sst1_scatter_u64base_offset_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: st1q { z0.q }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64( %data, + %pg, + %b, + i64 %offset) + ret void +} + +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv16i8.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8i16.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4i32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2i64.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8f16.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv4f32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv2f64.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.st1q.scatter.scalar.offset.nxv8bf16.nxv2i64(, , , i64) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll --- 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-sclamp.ll @@ -6,8 +6,7 @@ define @test_sclamp_i8( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.b, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.b, z1.b, z2.b ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv16i8( %a, %b, %c) ret %res @@ -16,8 +15,7 @@ define @test_sclamp_i16( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv8i16( %a, %b, %c) ret %res @@ -26,8 +24,7 @@ define @test_sclamp_i32( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.s, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv4i32( %a, %b, %c) ret %res @@ -36,8 +33,7 @@ define @test_sclamp_i64( %a, %b, %c) #0 { ; CHECK-LABEL: test_sclamp_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sclamp z2.d, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.sclamp.nxv2i64( %a, %b, %c) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +; ST1W + +define void @test_svst1uwq_i32_ss( %zt, %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svst1uwq_i32_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.q }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %gep = getelementptr i32, ptr %base, i64 %offset + call void @llvm.aarch64.sve.st1uwq.nxv4i32( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1uwq_i32_si( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1uwq_i32_si: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.q }, p0, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z0.q }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + call void @llvm.aarch64.sve.st1uwq.nxv4i32( %zt, %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + call void @llvm.aarch64.sve.st1uwq.nxv4i32( %zt, %pred, ptr %gep2) + ret void +} + +define void @test_svst1uwq_i32_out_of_bound( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1uwq_i32_out_of_bound: +; CHECK: // %bb.0: +; CHECK-NEXT: addvl x8, x0, #2 +; CHECK-NEXT: st1w { z0.q }, p0, [x8] +; CHECK-NEXT: ret + %gep = getelementptr inbounds , * %base, i64 8 + call void @llvm.aarch64.sve.st1uwq.nxv4i32( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1uwq_f32_ss( %zt, %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svst1uwq_f32_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.q }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %gep = getelementptr float, ptr %base, i64 %offset + call void @llvm.aarch64.sve.st1uwq.nxv4f32( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1uwq_f32_si( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1uwq_f32_si: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.q }, p0, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z0.q }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %gep1 = 
getelementptr inbounds , * %base, i64 -8 + call void @llvm.aarch64.sve.st1uwq.nxv4f32( %zt, %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + call void @llvm.aarch64.sve.st1uwq.nxv4f32( %zt, %pred, ptr %gep2) + ret void +} + +; ST1D + +define void @test_svst1udq_i64_ss( %zt, %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svst1udq_i64_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.q }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %gep = getelementptr i64, ptr %base, i64 %offset + call void @llvm.aarch64.sve.st1udq.nxv2i64( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1udq_i64_si( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1udq_i64_si: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.q }, p0, [x0, #-8, mul vl] +; CHECK-NEXT: st1d { z0.q }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + call void @llvm.aarch64.sve.st1udq.nxv2i64( %zt, %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + call void @llvm.aarch64.sve.st1udq.nxv2i64( %zt, %pred, ptr %gep2) + ret void +} + +define void @test_svst1udq_i64_out_of_bound( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1udq_i64_out_of_bound: +; CHECK: // %bb.0: +; CHECK-NEXT: addvl x8, x0, #-5 +; CHECK-NEXT: st1d { z0.q }, p0, [x8] +; CHECK-NEXT: ret + %gep = getelementptr inbounds , * %base, i64 -10 + call void @llvm.aarch64.sve.st1udq.nxv2i64( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1udq_f64_ss( %zt, %pred, ptr %base, i64 %offset) { +; CHECK-LABEL: test_svst1udq_f64_ss: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.q }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %gep = getelementptr double, ptr %base, i64 %offset + call void @llvm.aarch64.sve.st1udq.nxv2f64( %zt, %pred, ptr %gep) + ret void +} + +define void @test_svst1udq_f64_si( %zt, %pred, * %base) { +; CHECK-LABEL: test_svst1udq_f64_si: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.q }, p0, [x0, #-8, mul vl] +; CHECK-NEXT: st1d { z0.q }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %gep1 = getelementptr inbounds , * %base, i64 -8 + call void @llvm.aarch64.sve.st1udq.nxv2f64( %zt, %pred, ptr %gep1) + + %gep2 = getelementptr inbounds , * %base, i64 7 + call void @llvm.aarch64.sve.st1udq.nxv2f64( %zt, %pred, ptr %gep2) + ret void +} + +declare void @llvm.aarch64.sve.st1uwq.nxv4i32(, , ptr) +declare void @llvm.aarch64.sve.st1uwq.nxv4f32(, , ptr) + +declare void @llvm.aarch64.sve.st1udq.nxv2i64(, , ptr) +declare void @llvm.aarch64.sve.st1udq.nxv2f64(, , ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tblq.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tblq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tblq.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_tblq_i8 ( %zn, %zm) { +; CHECK-LABEL: test_tblq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.b, { z0.b }, z1.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv16i8( %zn, %zm) + ret %res +} + +define @test_tblq_i16 ( %zn, %zm) { +; CHECK-LABEL: test_tblq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.h, { z0.h }, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv8i16( %zn, %zm) + ret %res +} + +define @test_tblq_i32 ( %zn, %zm) { +; CHECK-LABEL: test_tblq_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.s, { z0.s }, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv4i32( 
%zn, %zm) + ret %res +} + +define @test_tblq_i64 ( %zn, %zm) { +; CHECK-LABEL: test_tblq_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.d, { z0.d }, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv2i64( %zn, %zm) + ret %res +} + +define @test_tblq_f16( %zn, %zm) { +; CHECK-LABEL: test_tblq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.h, { z0.h }, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv8f16( %zn, %zm) + ret %res +} + +define @test_tblq_f32( %zn, %zm) { +; CHECK-LABEL: test_tblq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.s, { z0.s }, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv4f32( %zn, %zm) + ret %res +} + +define @test_tblq_f64( %zn, %zm) { +; CHECK-LABEL: test_tblq_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.d, { z0.d }, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv2f64( %zn, %zm) + ret %res +} + +define @test_tblq_bf16( %zn, %zm) { +; CHECK-LABEL: test_tblq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: tblq z0.h, { z0.h }, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tblq.nxv8bf16( %zn, %zm) + ret %res +} + +declare @llvm.aarch64.sve.tblq.nxv16i8(, ) +declare @llvm.aarch64.sve.tblq.nxv8i16(, ) +declare @llvm.aarch64.sve.tblq.nxv4i32(, ) +declare @llvm.aarch64.sve.tblq.nxv2i64(, ) +declare @llvm.aarch64.sve.tblq.nxv8f16(, ) +declare @llvm.aarch64.sve.tblq.nxv4f32(, ) +declare @llvm.aarch64.sve.tblq.nxv2f64(, ) +declare @llvm.aarch64.sve.tblq.nxv8bf16(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tbxq.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tbxq.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-tbxq.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_tbxq_i8 ( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.b, z1.b, z2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv16i8( %passthru, %zn, %zm) + ret %res +} + +define @test_tbxq_i16 ( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv8i16( %passthru, %zn, %zm) + ret %res +} + +define @test_tbxq_i32 ( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv4i32( %passthru, %zn, %zm) + ret %res +} + +define @test_tbxq_i64 ( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.d, z1.d, z2.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv2i64( %passthru, %zn, %zm) + ret %res +} + +define @test_tblq_f16( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tblq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv8f16( %passthru, %zn, %zm) + ret %res +} + +define @test_tbxq_f32( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv4f32( %passthru, %zn, %zm) + ret %res +} + +define @test_tbxq_f64( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.d, z1.d, z2.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv2f64( %passthru, %zn, %zm) + ret %res +} + +define 
@test_tbxq_bf16( %passthru, %zn, %zm) { +; CHECK-LABEL: test_tbxq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: tbxq z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.tbxq.nxv8bf16( %passthru, %zn, %zm) + ret %res +} + +declare @llvm.aarch64.sve.tbxq.nxv16i8(, , ) +declare @llvm.aarch64.sve.tbxq.nxv8i16(, , ) +declare @llvm.aarch64.sve.tbxq.nxv4i32(, , ) +declare @llvm.aarch64.sve.tbxq.nxv2i64(, , ) +declare @llvm.aarch64.sve.tbxq.nxv8f16(, , ) +declare @llvm.aarch64.sve.tbxq.nxv4f32(, , ) +declare @llvm.aarch64.sve.tbxq.nxv2f64(, , ) +declare @llvm.aarch64.sve.tbxq.nxv8bf16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uclamp.ll @@ -6,8 +6,7 @@ define @test_uclamp_i8( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.b, z0.b, z1.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.b, z1.b, z2.b ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv16i8( %a, %b, %c) ret %res @@ -16,8 +15,7 @@ define @test_uclamp_i16( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.h, z1.h, z2.h ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv8i16( %a, %b, %c) ret %res @@ -26,8 +24,7 @@ define @test_uclamp_i32( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.s, z0.s, z1.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.s, z1.s, z2.s ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv4i32( %a, %b, %c) ret %res @@ -36,8 +33,7 @@ define @test_uclamp_i64( %a, %b, %c) #0 { ; CHECK-LABEL: test_uclamp_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uclamp z2.d, z0.d, z1.d -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uclamp z0.d, z1.d, z2.d ; CHECK-NEXT: ret %res = call @llvm.aarch64.sve.uclamp.nxv2i64( %a, %b, %c) ret %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq1.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq1.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_uzpq1_i8( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv16i8( %zn, %zm) + ret %res +} + +define @test_uzpq1_i16( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv8i16( %zn, %zm) + ret %res +} + +define @test_uzpq1_i32( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv4i32( %zn, %zm) + ret %res +} + +define @test_uzpq1_i64( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv2i64( %zn, %zm) + ret %res +} + +define @test_uzpq1_f16( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv8f16( %zn, %zm) + ret %res 
+} + +define @test_uzpq1_f32( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv4f32( %zn, %zm) + ret %res +} + +define @test_uzpq1_f64( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv2f64( %zn, %zm) + ret %res +} + +define @test_uzpq1_bf16( %zn, %zm) { +; CHECK-LABEL: test_uzpq1_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq1.nxv8bf16( %zn, %zm) + ret %res +} + + +declare @llvm.aarch64.sve.uzpq1.nxv16i8(, ) +declare @llvm.aarch64.sve.uzpq1.nxv8i16(, ) +declare @llvm.aarch64.sve.uzpq1.nxv4i32(, ) +declare @llvm.aarch64.sve.uzpq1.nxv2i64(, ) + +declare @llvm.aarch64.sve.uzpq1.nxv8f16(, ) +declare @llvm.aarch64.sve.uzpq1.nxv4f32(, ) +declare @llvm.aarch64.sve.uzpq1.nxv2f64(, ) +declare @llvm.aarch64.sve.uzpq1.nxv8bf16(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpq2.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_uzpq2_i8( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv16i8( %zn, %zm) + ret %res +} + +define @test_uzpq2_i16( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv8i16( %zn, %zm) + ret %res +} + +define @test_uzpq2_i32( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv4i32( %zn, %zm) + ret %res +} + +define @test_uzpq2_i64( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv2i64( %zn, %zm) + ret %res +} + +define @test_uzpq2_f16( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv8f16( %zn, %zm) + ret %res +} + +define @test_uzpq2_f32( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv4f32( %zn, %zm) + ret %res +} + +define @test_uzpq2_f64( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv2f64( %zn, %zm) + ret %res +} + +define @test_uzpq2_bf16( %zn, %zm) { +; CHECK-LABEL: test_uzpq2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzpq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.uzpq2.nxv8bf16( %zn, %zm) + ret %res +} + + +declare @llvm.aarch64.sve.uzpq2.nxv16i8(, ) +declare @llvm.aarch64.sve.uzpq2.nxv8i16(, ) +declare @llvm.aarch64.sve.uzpq2.nxv4i32(, ) +declare @llvm.aarch64.sve.uzpq2.nxv2i64(, ) + +declare @llvm.aarch64.sve.uzpq2.nxv8f16(, ) +declare @llvm.aarch64.sve.uzpq2.nxv4f32(, ) +declare @llvm.aarch64.sve.uzpq2.nxv2f64(, ) +declare @llvm.aarch64.sve.uzpq2.nxv8bf16(, 
) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll @@ -3,40 +3,31 @@ ; == 8 to 64-bit elements == -define @uzp_x2_i8( %zn, %zm) nounwind { +define { , } @uzp_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.b, z3.b }, z0.b, z1.b -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: uzp { z0.b, z1.b }, z0.b, z1.b ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) + ret { , } %res } -define @uzp_x2_i16( %zn, %zm) nounwind { +define { , } @uzp_x2_i16( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) + ret { , } %res } -define @uzp_x2_f16( %zn, %zm) nounwind { +define { , } @uzp_x2_f16( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: fadd z0.h, z2.h, z0.h +; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( %zn, %zm) + ret { , } %res } define { , } @uzp_x2_bf16( %zn, %zm) nounwind { @@ -44,74 +35,56 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) - ret { , } %uzp + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) + ret { , } %res } -define @uzp_x2_i32( %zn, %zm) nounwind { +define { , } @uzp_x2_i32( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) + ret { , } %res } -define @uzp_x2_f32( %zn, %zm) nounwind { +define { , } @uzp_x2_f32( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: fadd z0.s, z2.s, z0.s +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) + ret { , } %res } -define @uzp_x2_i64( %zn, %zm) nounwind { +define { , } @uzp_x2_i64( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } 
@llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) + ret { , } %res } -define @uzp_x2_f64( %zn, %zm) nounwind { +define { , } @uzp_x2_f64( %zn, %zm) nounwind { ; CHECK-LABEL: uzp_x2_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: fadd z0.d, z2.d, z0.d +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = fadd %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) + ret { , } %res } ; == 128-bit elements == -; NOTE: For the 128-bit case we only need to check the to -; ensure the tuple result starts at the correct register multiple. The other -; variants all test the same code path. -define @uzpq_x2_i8( %zn, %zm) nounwind { +define { , } @uzpq_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: uzpq_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z2.q, z3.q }, z0.q, z1.q -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q ; CHECK-NEXT: ret - %uzp = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) - %uzp0 = extractvalue { , } %uzp, 0 - %add = add %uzp0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) + ret { , } %res } define { , } @uzpq_x2_i16( %zn, %zm) nounwind { @@ -177,18 +150,9 @@ ret { , } %res } -define { , } @uzpq_x2_i8_not_tied( %unused, %zn, %zm) nounwind { -; CHECK-LABEL: uzpq_x2_i8_not_tied: -; CHECK: // %bb.0: -; CHECK-NEXT: uzp { z0.q, z1.q }, z1.q, z2.q -; CHECK-NEXT: ret - %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) - ret { , } %res -} - ; == 8 to 64-bit elements == -declare { , } @llvm.aarch64.sve.uzp.x2.nxv16i8( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv16i8(, ) declare { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) declare { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) declare { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll @@ -3,105 +3,105 @@ ; == 8 to 64-bit elements == -define { , , , } @uzp_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.b - z3.b }, { z4.b - z7.b } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.b - z3.b }, { z0.b - z3.b } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 
killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // 
kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @uzp_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @uzp_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: uzp_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z27.d, z5.d -; CHECK-NEXT: mov z26.d, z4.d -; CHECK-NEXT: mov z25.d, z3.d -; CHECK-NEXT: mov z24.d, z2.d -; CHECK-NEXT: uzp { z0.d - z3.d }, { z24.d - z27.d } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res @@ -110,105 +110,105 @@ ; == 128-bit elements == -define { , , , } @zipq_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, 
{ z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_i32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res 
} -define { , , , } @zipq_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z7.d, z4.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res } -define { , , , } @zipq_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +define { , , , } @zipq_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { ; CHECK-LABEL: zipq_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z27.d, z5.d -; CHECK-NEXT: mov z26.d, z4.d -; CHECK-NEXT: mov z25.d, z3.d -; CHECK-NEXT: mov z24.d, z2.d -; CHECK-NEXT: uzp { z0.q - z3.q }, { z24.q - z27.q } +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } ; CHECK-NEXT: ret %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) ret { , , , } %res diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll @@ -377,7 +377,7 @@ } -; Test that we get good code quality when using while in combination with other intrinsics +; Test that we get good code quality for code generated from ACLE builtins define @codegen_whilege_b16_x2(i64 noundef %op1, i64 noundef %op2) nounwind { ; CHECK-LABEL: codegen_whilege_b16_x2: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq1.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq1.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_zipq1_i8( %zn, %zm) { +; CHECK-LABEL: test_zipq1_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv16i8( %zn, %zm) + ret %res +} + +define @test_zipq1_i16( %zn, %zm) { +; CHECK-LABEL: test_zipq1_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv8i16( %zn, %zm) + ret %res +} + +define @test_zipq1_i32( %zn, %zm) { +; CHECK-LABEL: test_zipq1_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv4i32( %zn, %zm) + ret %res +} + +define @test_zipq1_i64( %zn, %zm) { +; CHECK-LABEL: test_zipq1_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv2i64( %zn, %zm) + ret %res +} + 
+define @test_zipq1_f16( %zn, %zm) { +; CHECK-LABEL: test_zipq1_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv8f16( %zn, %zm) + ret %res +} + +define @test_zipq1_f32( %zn, %zm) { +; CHECK-LABEL: test_zipq1_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv4f32( %zn, %zm) + ret %res +} + +define @test_zipq1_f64( %zn, %zm) { +; CHECK-LABEL: test_zipq1_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv2f64( %zn, %zm) + ret %res +} + +define @test_zipq1_bf16( %zn, %zm) { +; CHECK-LABEL: test_zipq1_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq1.nxv8bf16( %zn, %zm) + ret %res +} + + +declare @llvm.aarch64.sve.zipq1.nxv16i8(, ) +declare @llvm.aarch64.sve.zipq1.nxv8i16(, ) +declare @llvm.aarch64.sve.zipq1.nxv4i32(, ) +declare @llvm.aarch64.sve.zipq1.nxv2i64(, ) + +declare @llvm.aarch64.sve.zipq1.nxv8f16(, ) +declare @llvm.aarch64.sve.zipq1.nxv4f32(, ) +declare @llvm.aarch64.sve.zipq1.nxv2f64(, ) +declare @llvm.aarch64.sve.zipq1.nxv8bf16(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipq2.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @test_zipq2_i8( %zn, %zm) { +; CHECK-LABEL: test_zipq2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.b, z0.b, z1.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv16i8( %zn, %zm) + ret %res +} + +define @test_zipq2_i16( %zn, %zm) { +; CHECK-LABEL: test_zipq2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv8i16( %zn, %zm) + ret %res +} + +define @test_zipq2_i32( %zn, %zm) { +; CHECK-LABEL: test_zipq2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv4i32( %zn, %zm) + ret %res +} + +define @test_zipq2_i64( %zn, %zm) { +; CHECK-LABEL: test_zipq2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv2i64( %zn, %zm) + ret %res +} + +define @test_zipq2_f16( %zn, %zm) { +; CHECK-LABEL: test_zipq2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv8f16( %zn, %zm) + ret %res +} + +define @test_zipq2_f32( %zn, %zm) { +; CHECK-LABEL: test_zipq2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv4f32( %zn, %zm) + ret %res +} + +define @test_zipq2_f64( %zn, %zm) { +; CHECK-LABEL: test_zipq2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv2f64( %zn, %zm) + ret %res +} + +define @test_zipq2_bf16( %zn, %zm) { +; CHECK-LABEL: test_zipq2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: zipq2 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.zipq2.nxv8bf16( %zn, %zm) + ret %res +} + + +declare @llvm.aarch64.sve.zipq2.nxv16i8(, ) +declare @llvm.aarch64.sve.zipq2.nxv8i16(, ) +declare @llvm.aarch64.sve.zipq2.nxv4i32(, ) +declare 
@llvm.aarch64.sve.zipq2.nxv2i64(, ) + +declare @llvm.aarch64.sve.zipq2.nxv8f16(, ) +declare @llvm.aarch64.sve.zipq2.nxv4f32(, ) +declare @llvm.aarch64.sve.zipq2.nxv2f64(, ) +declare @llvm.aarch64.sve.zipq2.nxv8bf16(, ) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll @@ -3,40 +3,31 @@ ; == 8 to 64-bit elements == -define @zip_x2_i8( %zn, %zm) nounwind { +define { , } @zip_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.b, z3.b }, z0.b, z1.b -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: zip { z0.b, z1.b }, z0.b, z1.b ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = add %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) + ret { , } %res } -define @zip_x2_i16( %zn, %zm) nounwind { +define { , } @zip_x2_i16( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = add %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %zn, %zm) + ret { , } %res } -define @zip_x2_f16( %zn, %zm) nounwind { +define { , } @zip_x2_f16( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.h, z3.h }, z0.h, z1.h -; CHECK-NEXT: fadd z0.h, z2.h, z0.h +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = fadd %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( %zn, %zm) + ret { , } %res } define { , } @zip_x2_bf16( %zn, %zm) nounwind { @@ -44,74 +35,65 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( %zn, %zm) - ret { , } %zip + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( %zn, %zm) + ret { , } %res } -define @zip_x2_i32( %zn, %zm) nounwind { +define { , } @zip_x2_i32( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = add %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %zn, %zm) + ret { , } %res } -define @zip_x2_f32( %zn, %zm) nounwind { +define { , } @zip_x2_f32( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.s, z3.s }, z0.s, z1.s -; CHECK-NEXT: fadd z0.s, z2.s, z0.s +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = fadd %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( %zn, %zm) + ret { , } %res } -define @zip_x2_i64( %zn, %zm) nounwind { +define { , } @zip_x2_i64( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: add z0.d, z2.d, z0.d +; 
CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = add %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %zn, %zm) + ret { , } %res } -define @zip_x2_f64( %zn, %zm) nounwind { +define { , } @zip_x2_f64( %zn, %zm) nounwind { ; CHECK-LABEL: zip_x2_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.d, z3.d }, z0.d, z1.d -; CHECK-NEXT: fadd z0.d, z2.d, z0.d +; CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = fadd %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_i8_not_tied( %unused, %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i8_not_tied: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.b, z1.b }, z1.b, z2.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) + ret { , } %res } ; == 128-bit elements == -; NOTE: For the 128-bit case we only need to check the to -; ensure the tuple result starts at the correct register multiple. The other -; variants all test the same code path. -define @zipq_x2_i8( %zn, %zm) nounwind { +define { , } @zipq_x2_i8( %zn, %zm) nounwind { ; CHECK-LABEL: zipq_x2_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: zip { z2.q, z3.q }, z0.q, z1.q -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q ; CHECK-NEXT: ret - %zip = call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( %zn, %zm) - %zip0 = extractvalue { , } %zip, 0 - %add = add %zip0, %zn - ret %add + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( %zn, %zm) + ret { , } %res } define { , } @zipq_x2_i16( %zn, %zm) nounwind { diff --git a/llvm/test/CodeGen/AArch64/variant-pcs.ll b/llvm/test/CodeGen/AArch64/variant-pcs.ll --- a/llvm/test/CodeGen/AArch64/variant-pcs.ll +++ b/llvm/test/CodeGen/AArch64/variant-pcs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s --check-prefix=CHECK-ASM --strict-whitespace -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -filetype=obj -o - %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -o - %s | FileCheck %s --check-prefix=CHECK-ASM --strict-whitespace +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -filetype=obj -o - %s \ ; RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-OBJ define i32 @base_pcs() { @@ -49,3 +49,40 @@ ; CHECK-OBJ: Other [ (0x80) ret void } + +define void @variant_pcs_sme_streaming() "aarch64_pstate_sm_enabled" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_streaming +; CHECK-ASM-NEXT: variant_pcs_sme_streaming: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_streaming +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_streaming_compatible() "aarch64_pstate_sm_compatible" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_streaming_compatible +; CHECK-ASM-NEXT: variant_pcs_sme_streaming_compatible: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_streaming_compatible +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_shared_za() "aarch64_pstate_za_shared" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_shared_za +; CHECK-ASM-NEXT: variant_pcs_sme_shared_za: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_shared_za +; CHECK-OBJ: Other [ (0x80) + ret void +} + +define void @variant_pcs_sme_new_za() "aarch64_pstate_za_new" { +; CHECK-ASM-NOT: .variant_pcs variant_pcs_sme_new_za + ret void 
+} + +define void @variant_pcs_sme_preserves_za() "aarch64_pstate_za_shared" "aarch64_pstate_za_preserved" { +; CHECK-ASM: .variant_pcs variant_pcs_sme_preserves_za +; CHECK-ASM-NEXT: variant_pcs_sme_preserves_za: +; CHECK-OBJ-LABEL: Name: variant_pcs_sme_preserves_za +; CHECK-OBJ: Other [ (0x80) + ret void +} diff --git a/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s b/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p1/feature-sve2p1-implies-bf16.s @@ -0,0 +1,7 @@ +// This test verifies SVE2p1 implies bf16. + +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p1 2>&1 < %s \ +// RUN: | FileCheck %s + +bfcvt z0.h, p0/m, z1.s +// CHECK: bfcvt z0.h, p0/m, z1.s \ No newline at end of file diff --git a/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll b/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/AArch64/sme-streaming-attr.ll @@ -0,0 +1,339 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s + +declare void @inlined_streaming_body() "aarch64_pstate_sm_enabled"; +declare void @inlined_locally_streaming_body() "aarch64_pstate_sm_body"; +declare void @inlined_streaming_compatible_body() "aarch64_pstate_sm_compatible"; +declare void @inlined_streaming_compatible_locally_streaming_body() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body"; +declare void @inlined_normal_body(); + +define void @streaming_callee() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_body() + ret void +} + +define void @locally_streaming_callee() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_locally_streaming_body() + ret void +} + +define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_compatible_body() + ret void +} + +define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_streaming_compatible_locally_streaming_body() + ret void +} + +define void @normal_callee() { +; CHECK-LABEL: @normal_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_normal_body() + ret void +} + +define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_locally_streaming_callee_inline( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_streaming_compatible_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: @streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" { +; CHECK-LABEL: @locally_streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: @streaming_compatible_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @normal_caller_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @normal_caller_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @locally_streaming_callee() + ret void +} + +define void @normal_caller_streaming_compatible_callee_inline() { +; CHECK-LABEL: @normal_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @streaming_compatible_locally_streaming_callee() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @normal_caller_normal_callee_inline() { +; CHECK-LABEL: @normal_caller_normal_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + + +define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + 
call void @locally_streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_and_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_streaming_compatible_locally_streaming_body() +; CHECK-NEXT: ret void +; +entry: + call void @streaming_compatible_locally_streaming_callee() + ret void +} + +define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" { +; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} diff --git a/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll b/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/AArch64/sme-za-attr.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline < %s | FileCheck %s + +declare void @inlined_normal_body(); +declare void @inlined_shared_za_body() "aarch64_pstate_za_shared"; +declare void @inlined_new_za_body() "aarch64_pstate_za_new"; + +define void @normal_callee() { +; CHECK-LABEL: @normal_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_normal_body() + ret void +} + +define void @shared_za_callee() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_callee( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @inlined_shared_za_body() + ret void +} + +define void @new_za_callee() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_callee( +; CHECK-NEXT: call void @inlined_new_za_body() +; CHECK-NEXT: ret void +; + call void @inlined_new_za_body() + ret void +} + +define void @normal_caller_normal_callee_inline() { +; CHECK-LABEL: @normal_caller_normal_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_normal_body() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @normal_caller_new_za_callee_dont_inline() { +; CHECK-LABEL: @normal_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +} + +define void @new_za_caller_normal_callee_dont_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @new_za_caller_shared_za_callee_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_shared_za_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void 
@inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @shared_za_callee() + ret void +} + +define void @new_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_new" { +; CHECK-LABEL: @new_za_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +} + +define void @shared_za_caller_normal_callee_dont_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_normal_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @normal_callee() +; CHECK-NEXT: ret void +; +entry: + call void @normal_callee() + ret void +} + +define void @shared_za_caller_shared_za_callee_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_shared_za_callee_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @inlined_shared_za_body() +; CHECK-NEXT: ret void +; +entry: + call void @shared_za_callee() + ret void +} + +define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_shared" { +; CHECK-LABEL: @shared_za_caller_new_za_callee_dont_inline( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @new_za_callee() +; CHECK-NEXT: ret void +; +entry: + call void @new_za_callee() + ret void +}